/* File swaptop.c of Package mmdebug-tools */

/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2025 SUSE LLC */
#define _GNU_SOURCE
#include <ctype.h>
#include <dirent.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <ftw.h>
#include <limits.h>
#include <mntent.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <time.h>

#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>

/* One row of the swap usage report. Rows are created through swap_add() for
 * processes, shmem-backed files and SysV shared memory segments. */
struct swap_consumer
{
	/* Display name (e.g. "comm[pid]", a file path, or a sysv-shm tag) */
	char name[PATH_MAX];
	/* Bytes of swap accounted as private; files and SysV segments also use this field */
	uint64_t private_size;
	/* Bytes of swap accounted to shared mappings */
	uint64_t shared_size;
};

/* Growable table of the consumers found during the current scan */
static struct swap_consumer *consumers;
/* Number of entries of consumers[] that are in use */
static unsigned int nr_consumers;
/* log2 of the number of entries allocated for consumers[] */
static unsigned int consumer_capacity;
/* Running per-category totals for the current scan, in bytes */
static uint64_t total_sysv, total_anon, total_file, total_shared;
/* Command line switches, see long_options[] */
static int no_pss, ignore_shared, do_json, continuous;

/*
 * Append a consumer to the global consumers[] table and return a pointer to
 * the new entry.
 *
 * name:         display name; copied into the entry and truncated if it does
 *               not fit in PATH_MAX - 1 characters
 * private_size: bytes of private swap
 * shared_size:  bytes of shared swap
 *
 * The table doubles in size whenever it is full. Allocation failure is fatal
 * (err(1, ...)). The returned pointer is only valid until the next call, as
 * the table may be moved by reallocarray().
 */
struct swap_consumer *swap_add(const char *name, uint64_t private_size, uint64_t shared_size)
{
	struct swap_consumer *sc;

	if (nr_consumers == (1U << consumer_capacity) || !consumers) {
		consumer_capacity++;
		consumers = reallocarray(consumers, (1U << consumer_capacity),
				         sizeof(struct swap_consumer));
		if (!consumers)
			err(1, "swap_add");
	}

	sc = &consumers[nr_consumers++];
	/* Bounded copy: paths handed out by nftw() may be longer than PATH_MAX, and
	 * they come from user-writable filesystems. Truncate instead of overflowing. */
	snprintf(sc->name, sizeof(sc->name), "%s", name);
	sc->private_size = private_size;
	sc->shared_size = shared_size;
	return sc;
}

/*
 * Return true if name is a non-empty string made only of decimal digits,
 * i.e. if a /proc directory entry looks like a process id.
 */
static bool is_pid(const char *name)
{
	/* An empty name is not a pid */
	if (!*name)
		return false;

	for (; *name; name++) {
		/* <ctype.h> functions are only defined for unsigned char values (and EOF) */
		if (!isdigit((unsigned char)*name))
			return false;
	}

	return true;
}

/*
 * Open filename relative to dirfd and return the number that follows the
 * first line starting with entry (parsed as a base-10 integer).
 *
 * Returns 0 if the file cannot be opened. If the file was read but no line
 * starts with entry, a warning is printed and 0 is returned.
 */
static uint64_t open_parse_file(int dirfd, const char *filename, const char *entry)
{
	const size_t prefix_len = strlen(entry);
	uint64_t result = 0;
	bool found = false;
	char *buf = NULL;
	size_t buf_cap = 0;
	FILE *stream;
	int fd;

	fd = openat(dirfd, filename, O_RDONLY);
	if (fd < 0)
		return 0;

	stream = fdopen(fd, "r");
	if (stream == NULL) {
		warn("fdopen");
		close(fd);
		return 0;
	}

	/* Stop at the first matching line */
	while (!found && getline(&buf, &buf_cap, stream) != -1) {
		if (strncmp(buf, entry, prefix_len) != 0)
			continue;
		result = strtoll(buf + prefix_len, NULL, 10);
		found = true;
	}

	if (!found)
		warnx("entry '%s' not found in file %s", entry, filename);

	free(buf);
	fclose(stream);
	return result;
}

/*
 * Tell smaps mapping headers apart from "Key: value" entries.
 *
 * Normal entries are vaguely described by '([a-zA-Z]|_)*:'. Anything that
 * is not matched by that is treated as a header: the function returns false
 * when a ':' is reached after only letters and underscores, true otherwise.
 */
static bool looks_like_header(const char *line)
{
	const char *s;

	for (s = line; *s; s++) {
		/* <ctype.h> functions are only defined for unsigned char values (and EOF) */
		unsigned char c = (unsigned char)*s;

		if (c == ':')
			return false;
		if (!isalpha(c) && c != '_')
			break;
	}

	return true;
}

/*
 * Walk the "smaps" file found under dirfd and account the swap of every mapping.
 *
 * dirfd:       directory fd of a /proc/<pid> directory
 * no_pss:      if non-zero, private mappings are accounted with "Swap:"
 *              instead of "SwapPss:"
 * shared_size: in/out. Swap of shared mappings (in kB) is added to it while
 *              parsing and the whole value is converted to bytes on return,
 *              so callers are expected to pass in 0.
 *
 * Returns the swap of the non-shared mappings, in bytes. Returns 0, leaving
 * *shared_size untouched, if smaps cannot be opened.
 *
 * Side effect: swap of shared mappings is also added to total_shared, even
 * when ignore_shared is set.
 */
static uint64_t parse_smaps(int dirfd, int no_pss, uint64_t *shared_size)
{
	FILE *file;
	int fd;
	char *line = NULL;
	uint64_t swap = 0, this_swap = 0, this_pss = 0;
	bool is_shared;
	size_t line_len = 0;

	fd = openat(dirfd, "smaps", O_RDONLY);
	if (fd < 0)
		return 0;

	file = fdopen(fd, "r");
	if (!file) {
		warn("fdopen");
		close(fd);
		return 0;
	}

	while (getline(&line, &line_len, file) != -1) {
		if (looks_like_header(line))
			continue;

		if (!strncmp(line, "Swap:", strlen("Swap:"))) {
			this_swap = strtoull(line + strlen("Swap:"), NULL, 10);
			continue;
		}

		if (!strncmp(line, "SwapPss:", strlen("SwapPss:"))) {
			this_pss = strtoull(line + strlen("SwapPss:"), NULL, 10);
			continue;
		}

		if (strncmp(line, "VmFlags:", strlen("VmFlags:")))
			continue;

		/* VmFlags: is the last entry of a mapping in smaps, take this opportunity
		 * to account its swap.
		 *
		 * Look for the shared flag. If the mapping is shared, SwapPss: is
		 * meaningless, and Swap: _can_ be meaningful, for shared shmem mappings.
		 * Note: For shmem mappings, the kernel (as of 6.17) accounts swap, but
		 * not swap pss. This means that blindly relying on SwapPss for shared
		 * mappings will give wildly different numbers. */
		is_shared = strstr(line, " sh ") != NULL;

		if (is_shared) {
			total_shared += this_swap << 10;
			if (!ignore_shared)
				*shared_size += this_swap;
		} else {
			/* Honour no_pss: use the pinned swap rather than the proportional share */
			swap += no_pss ? this_swap : this_pss;
		}

		/* Never carry values over to the next mapping */
		this_swap = this_pss = 0;
	}

	free(line);
	fclose(file);
	/* smaps reports kB, callers want bytes */
	*shared_size <<= 10;
	return swap << 10;
}

/*
 * Account the swap used by one process and, if it uses any, add a
 * "comm[pid]" entry to the consumer table.
 *
 * dirfd:  directory fd of /proc
 * pidstr: name of the process directory under dirfd
 *
 * Failures are silent: processes may go away at any point while being read.
 */
static void swap_check_proc(int dirfd, const char *pidstr)
{
	int fd, fd2;
	/* Stays 0 when built with SMAPS_ROLLUP, which has no per-mapping breakdown */
	uint64_t swap, shared_swap = 0;
	char comm[48] = {0};
	char name[sizeof(comm) + 32];
	ssize_t comm_len;

	fd = openat(dirfd, pidstr, O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		/* Can easily fail if a process is going away, don't complain */
		return;
	}
#ifdef SMAPS_ROLLUP
	swap = open_parse_file(fd, "smaps_rollup", no_pss ? "Swap:" : "SwapPss:") << 10;
#else
	swap = parse_smaps(fd, no_pss, &shared_swap);
#endif
	if (swap + shared_swap == 0)
		goto out;

	fd2 = openat(fd, "comm", O_RDONLY);
	if (fd2 < 0)
		goto out;
	/* Leave room for the terminator */
	comm_len = read(fd2, comm, sizeof(comm) - 1);
	/* fd2 is not needed anymore, whatever the outcome of the read was */
	close(fd2);
	if (comm_len <= 0)
		goto out;

	/* comm comes with a trailing newline, drop it */
	if (comm[comm_len - 1] == '\n')
		comm_len--;
	comm[comm_len] = '\0';

	snprintf(name, sizeof(name), "%s[%s]", comm, pidstr);
	total_anon += swap;
	swap_add(name, swap, shared_swap);

out:
	close(fd);
}

/*
 * Scan every numeric entry of /proc and account the swap of the matching
 * process. Not being able to open /proc is fatal.
 */
static void list_procs(void)
{
	DIR *proc = opendir("/proc");
	struct dirent *entry;

	if (proc == NULL)
		err(1, "Opening /proc failed");

	for (entry = readdir(proc); entry != NULL; entry = readdir(proc)) {
		/* Only the per-process directories are of interest */
		if (is_pid(entry->d_name))
			swap_check_proc(dirfd(proc), entry->d_name);
	}

	closedir(proc);
}

/* Filesystem types whose files are scanned for swapped-out pages */
static const char * const swapping_fses[] = {
	"tmpfs",
	"devtmpfs",
};

/*
 * Return true if fs_type names a filesystem whose pages may end up in swap.
 *
 * Some filesystems (particularly those based on shmemfs) may swap. As far
 * as we know there is no way to properly detect this, so the list is
 * hardcoded, in a fuzzy, best-effort way.
 */
static bool fs_may_swap(const char *fs_type)
{
	const char * const *fs = swapping_fses;
	const char * const *end = swapping_fses +
		sizeof(swapping_fses) / sizeof(swapping_fses[0]);

	while (fs != end) {
		if (strcmp(fs_type, *fs++) == 0)
			return true;
	}

	return false;
}

#ifndef SYS_cachestat
/* Fallback for libc headers that do not define the cachestat syscall number.
 * The original note claims 451 is portable across uapis/architectures.
 * NOTE(review): architectures with offset syscall tables (alpha and mips come
 * to mind) presumably use a different number - confirm before relying on it. */
#define SYS_cachestat 451
#endif

/* Byte range argument of the cachestat syscall wrapper below. scan_file()
 * passes {0, 0} and treats the result as covering the whole file. */
struct cachestat_range {
	uint64_t off;
	uint64_t len;
};

/* Result of the cachestat syscall wrapper below, all values are page counts
 * as far as this file is concerned. Only nr_evicted is consumed here, where
 * it is taken as the number of swapped-out pages of a shmem file. */
struct cachestat {
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};


int cachestat(unsigned int fd, struct cachestat_range *range, struct cachestat *cstat,
	      unsigned int flags)
{
	static int has_cachestat = -1;

	if (has_cachestat != 0) {
		if (syscall(SYS_cachestat, fd, range, cstat, flags) < 0) {
			if (errno == ENOSYS)
				has_cachestat = 0;
			return -1;
		}

		return 0;
	}

	errno = ENOSYS;
	return -1;
}

/*
 * Return true if offset off of the file behind fd lies inside a hole.
 *
 * buf is the stat information of the file; its inode and device numbers
 * identify which file the cached hole belongs to. The last hole found is
 * kept in static storage, so that consecutive offsets of the same file do
 * not need a pair of lseek() calls each.
 *
 * Returns false if the lookup itself fails.
 */
static bool is_in_file_hole(int fd, struct stat *buf, off_t off)
{
	static ino_t cached_ino;
	static dev_t cached_dev;
	static off_t hole_start, hole_end;
	static bool has_cache = false;
	const bool same_file = buf->st_ino == cached_ino && buf->st_dev == cached_dev;

	/* Drop the cache if we are looking at another file, or if we went past
	 * the cached hole already */
	if (has_cache && (!same_file || off >= hole_end))
		has_cache = false;

	if (has_cache)
		return off >= hole_start && off < hole_end;

	/* Lets (ab)use lseek SEEK_HOLE and SEEK_DATA. Something nicer like FIEMAP
	 * would be useful, but it doesn't work for shmemfs. */
	hole_start = lseek(fd, off, SEEK_HOLE);
	if (hole_start < 0)
		return false;

	hole_end = lseek(fd, hole_start, SEEK_DATA);
	if (hole_end < 0) {
		if (errno != ENXIO)
			return false;
		/* No more data after the hole. Use the largest off_t value as its end */
		hole_end = sizeof(off_t) == 4 ? INT_MAX : LLONG_MAX;
	}

	cached_ino = buf->st_ino;
	cached_dev = buf->st_dev;
	has_cache = true;

	return off >= hole_start && off < hole_end;
}

/*
 * Count the swapped-out pages of one file and, if there are any, add the
 * file to the consumer table and to total_file.
 *
 * path: name used for the consumer entry
 * fd:   readable fd of the file
 * buf:  stat information of the file
 *
 * cachestat() is tried first. If it fails, the file is mapped and probed
 * with mincore() and SEEK_HOLE/SEEK_DATA instead.
 */
static void scan_file(const char *path, int fd, struct stat *buf)
{
	enum { VEC_PAGES = 8192 };
	static unsigned char vec[VEC_PAGES];
	const unsigned long page_size = sysconf(_SC_PAGESIZE);
	const uint64_t nr_pages = (buf->st_size + page_size - 1) / page_size;
	struct cachestat_range range = {0, 0};
	struct cachestat cstat;
	uint64_t evicted = 0, first_page;
	unsigned long chunk, k;
	char *map;

	/* No size, no pages */
	if (buf->st_size == 0)
		return;

	if (cachestat(fd, &range, &cstat, 0) == 0) {
		evicted = cstat.nr_evicted;
		if (evicted > 0) {
			total_file += evicted * page_size;
			swap_add(path, evicted * page_size, 0);
		}
		return;
	}

	/* TODO: This doesn't work well for 32-bit systems, with LFS... We should chunk
	 * this, like we do for mincore() calls. */
	map = mmap(NULL, buf->st_size, PROT_READ, MAP_SHARED | MAP_NORESERVE, fd, 0);
	if (map == MAP_FAILED)
		return;

	/* With the whole file mapped, combine mincore() and SEEK_DATA. For shmemfs,
	 * a non-present page (mincore() looks at the page cache too, not merely at
	 * the PTEs) is either a swapped-out folio or part of a file hole, and
	 * mincore() cannot tell which. shmemfs does implement SEEK_HOLE/SEEK_DATA
	 * though, so every non-present page is checked against the file holes:
	 * whatever is not in a hole must have been swapped out. */
	for (first_page = 0; first_page < nr_pages; first_page += chunk) {
		/* Probe at most VEC_PAGES pages per mincore() call */
		chunk = nr_pages - first_page > VEC_PAGES ? VEC_PAGES : nr_pages - first_page;
		if (mincore(map + first_page * page_size, chunk * page_size, vec)) {
			warn("mincore");
			goto unmap;
		}

		for (k = 0; k < chunk; k++) {
			if (vec[k])
				continue;
			/* Not present. Swapped out, unless we are in a file hole */
			if (!is_in_file_hole(fd, buf, (first_page + k) * page_size))
				evicted++;
		}
	}

	if (evicted > 0) {
		total_file += evicted * page_size;
		swap_add(path, evicted * page_size, 0);
	}
unmap:
	munmap(map, buf->st_size);
}

/*
 * nftw() callback: scan every regular file for swapped-out pages.
 *
 * fpath:    path of the entry
 * sb:       stat information nftw() collected for the entry
 * typeflag: only FTW_F entries are looked at
 * ftwbuf:   unused
 *
 * Always returns 0, so that the tree walk carries on whatever happens to a
 * single file.
 */
static int scan_file_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf)
{
	int fd, fd2;
	struct stat buf;
	char path[NAME_MAX];

	(void)ftwbuf;

	if (typeflag != FTW_F)
		return 0;

	/* Open the file as O_PATH to get a permanent reference to it. If it looks ok, open
	 * /proc/self/fd/<fd> - this will let us complete the path lookup, while making sure it's
	 * the same file */
	fd = open(fpath, O_PATH | O_NOFOLLOW);
	if (fd < 0)
		return 0;

	/* Double check that we're not racing with something. So after grabbing an
	 * fd, recheck its stat() information. We do not want to be tricked into
	 * opening a device file, or something else with fun possible side effects. */
	if (fstat(fd, &buf) < 0) {
		warn("scan_file_cb: fstat");
		goto out;
	}

	/* Inode numbers are only unique within one device, so compare both */
	if (buf.st_dev != sb->st_dev ||
	    buf.st_ino != sb->st_ino ||
	    !S_ISREG(buf.st_mode))
		goto out;

	/* Note that NAME_MAX is way more than enough space for this path */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	/* We don't need to pass O_NOFOLLOW anymore, this is not a symlink */
	fd2 = open(path, O_RDONLY);
	if (fd2 < 0)
		goto out;
	/* From now on only the readable fd is needed */
	close(fd);
	fd = fd2;
	scan_file(fpath, fd, &buf);

out:
	close(fd);
	return 0;
}

/*
 * Walk the filesystem mounted at mntpoint and scan each of its files.
 * Symlinks are not followed and other mounts are not crossed. The result
 * of the walk itself is not checked.
 */
static void scan_fs(const char *mntpoint)
{
	/* XXX arbitrary fd limit? */
	const int max_open_dirs = 400;
	const int walk_flags = FTW_PHYS | FTW_MOUNT;

	nftw(mntpoint, scan_file_cb, max_open_dirs, walk_flags);
}

/*
 * Go through the mount table of this process and scan every mount whose
 * filesystem type may swap. Not being able to read the mount table is fatal.
 */
static void list_shmemfs(void)
{
	FILE *mounts = setmntent("/proc/self/mounts", "r");
	struct mntent *ent;

	if (mounts == NULL)
		err(1, "Could not open /proc/self/mounts");

	for (;;) {
		ent = getmntent(mounts);
		if (ent == NULL)
			break;
		if (!fs_may_swap(ent->mnt_type))
			continue;
		scan_fs(ent->mnt_dir);
	}

	endmntent(mounts);
}

/*
 * Parse /proc/sysvipc/shm and add every SysV shared memory segment that
 * has swapped-out memory to the consumer table and to total_sysv.
 *
 * Does nothing if the file cannot be opened (SysV IPC may not be enabled).
 * Lines that do not parse are reported and skipped.
 */
static void list_sysvshm(void)
{
	FILE *file;
	char *line = NULL;
	size_t n = 0;
	bool first = true;
	/* Matches the %llu conversion exactly; uint64_t need not be unsigned long long */
	unsigned long long swap;
	int key, shmid;
	char namebuf[64];

	file = fopen("/proc/sysvipc/shm", "r");
	/* May not be enabled */
	if (!file)
		return;

	while (getline(&line, &n, file) != -1) {
		/* Discard the first line (header) */
		if (first) {
			first = false;
			continue;
		}

		/* key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap */
		if (sscanf(line, "%d %d %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %*llu %llu\n", &key, &shmid, &swap) != 3) {
			warnx("sysvipc/shm format error");
			continue;
		}

		if (swap > 0) {
			snprintf(namebuf, sizeof(namebuf), "sysv-shm[key=%d, shmid=%d]", key, shmid);
			total_sysv += swap;
			swap_add(namebuf, swap, 0);
		}
	}

	free(line);
	fclose(file);
}

/* Output mode state. interactive: set by main() when stdin and stdout are ttys
 * (cleared for summary and JSON output); term_rows/term_cols: terminal size
 * used for interactive output; do_summary: the --summary switch. */
static int interactive, term_rows, term_cols, do_summary;

/*
 * Return the value of the SwapCached: field of /proc/meminfo, as printed
 * there (callers print it as kB), or 0 if it cannot be read.
 */
static uint64_t get_swapcache(void)
{
	static const char meminfo_path[] = "/proc/meminfo";
	static const char field[] = "SwapCached:";

	return open_parse_file(AT_FDCWD, meminfo_path, field);
}

/* Larger of a and b. Each argument is evaluated exactly once (GNU C
 * statement expression). */
#define max(a, b)                                          \
({                                                         \
	__auto_type max_lhs_ = (a);                        \
	__auto_type max_rhs_ = (b);                        \
	max_lhs_ > max_rhs_ ? max_lhs_ : max_rhs_;         \
})

/* Smaller of a and b. Each argument is evaluated exactly once (GNU C
 * statement expression). */
#define min(a, b)                                          \
({                                                         \
	__auto_type min_lhs_ = (a);                        \
	__auto_type min_rhs_ = (b);                        \
	min_lhs_ < min_rhs_ ? min_lhs_ : min_rhs_;         \
})

/*
 * Throw away everything gathered by the last scan: release the consumer
 * table and zero the per-category totals, ready for the next scan.
 */
static void free_swap_info(void)
{
	free(consumers);
	consumers = NULL;
	consumer_capacity = 0;
	nr_consumers = 0;

	total_anon = 0;
	total_shared = 0;
	total_file = 0;
	total_sysv = 0;
}

/*
 * Print the human readable report: a line of totals followed, unless
 * do_summary is set, by one row per consumer in table order.
 *
 * In interactive mode the header lines are printed in reverse video and the
 * number of rows is limited to what fits on the terminal.
 *
 * All sizes are kept in bytes and printed in kB, except for the swapcache
 * value, which is printed as returned by get_swapcache().
 */
static void print_swap(void)
{
	struct swap_consumer *sc;
	unsigned int i;
	unsigned int to_print;
	uint64_t total = 0;
	uint64_t private, shared;
	unsigned int col_start = strlen("Swap user");
	unsigned int col_widths[3] = {
		[0 /* total */] = ((strlen("Total swap") + 16) & -8) - 3,
		[1 /* private */] = ((strlen("Private swap") + 16) & -8) - 3,
		[2 /* shared */] = ((strlen("Shared swap") + 16) & -8) - 3,
	};

	/* The name column must fit the longest name */
	for (i = 0; i < nr_consumers; i++) {
		sc = &consumers[i];
		col_start = max(col_start, strlen(sc->name));
		total += sc->private_size + sc->shared_size;
	}

	if (interactive) {
		/* Reverse bg and fg for the headers, erase the current line (so everything gets
		 * properly filled) */
		printf("\033[7m");
		printf("%*s\033[G", term_cols, "");
	}

	/* %llu takes an unsigned long long, which uint64_t need not be: convert explicitly */
	printf("Total swap used: %llu kB\tAnon: %llu kB\tShared: %llu kB\tFile: %llu kB\tSysvShm: %llu kB\tSwapcache: %llu kB\n",
			(unsigned long long)(total >> 10),
			(unsigned long long)(total_anon >> 10),
			(unsigned long long)(total_shared >> 10),
			(unsigned long long)(total_file >> 10),
			(unsigned long long)(total_sysv >> 10),
			(unsigned long long)get_swapcache());

	if (do_summary)
		return;

	if (interactive) {
		/* Restrict printing to whatever fits on the screen. We reserve two lines for the top rows
		 * and a line for the bottom row (newline). */
		to_print = min(nr_consumers, (unsigned int)(term_rows - 3));
	} else {
		to_print = nr_consumers;
	}

	/* Align col_start to a tab */
	col_start = (col_start + 15) & -8;
	if (interactive) {
		/* Erase line with proper fg and bg reversal */
		printf("%*s\033[G", term_cols, "");
	}

	/* A '*' field width must be passed as an int */
	printf("Swap user%*sTotal swap\t\tPrivate swap\t\tShared swap\n",
			(int)(col_start - strlen("Swap user")), "");

	if (interactive)
		printf("\033[27m");

	for (i = 0; i < to_print; i++) {
		sc = &consumers[i];
		private = sc->private_size;
		shared = sc->shared_size;
		printf("%-*s%20llu kB%*s%20llu kB%*s%20llu kB\n",
				(int)col_start, sc->name,
				(unsigned long long)((private + shared) >> 10),
				(int)(col_widths[0] - 20), "",
				(unsigned long long)(private >> 10),
				(int)(col_widths[1] - 20), "",
				(unsigned long long)(shared >> 10));
	}

	if (to_print == 0)
		printf("No swap users.\n");
}

/* Number of elements of an array. Only valid for true arrays, not for pointers. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/*
 * Write s to stdout as the contents of a JSON string (the surrounding quotes
 * are left to the caller).
 *
 * '"' and '\' are escaped with a backslash, the control characters that have
 * a short form use it (\b \f \n \r \t), and any other byte below 0x20 is
 * written as \u00XX. Every other byte, including bytes >= 0x80, is passed
 * through untouched, so UTF-8 input stays UTF-8.
 */
static void print_json_str(const char *s)
{
	for (; *s; s++) {
		/* Work on unsigned values: plain char may be signed, which would turn
		 * every byte >= 0x80 into a negative number */
		unsigned char c = (unsigned char)*s;

		switch (c) {
		case '"':
			fputs("\\\"", stdout);
			break;
		case '\\':
			fputs("\\\\", stdout);
			break;
		case '\b':
			fputs("\\b", stdout);
			break;
		case '\f':
			fputs("\\f", stdout);
			break;
		case '\n':
			fputs("\\n", stdout);
			break;
		case '\r':
			fputs("\\r", stdout);
			break;
		case '\t':
			fputs("\\t", stdout);
			break;
		default:
			if (c < 0x20) {
				/* No short form: output it as a \u escape */
				printf("\\u%04X", (unsigned int)c);
			} else {
				putchar(c);
			}
			break;
		}
	}
}

/*
 * Print one scan as a JSON array: a "record_start" object with the totals,
 * followed by one "swap_entry" object per consumer.
 *
 * first: true for the first record of the output. Every later record is
 *        preceded by a comma, as records are elements of an enclosing array
 *        that main() opens and closes.
 *
 * Sizes are printed in kB, as strings. stdout is flushed after each record
 * in continuous mode.
 */
static void print_json(bool first)
{
	struct swap_consumer *sc;
	unsigned int i;
	uint64_t total = 0;

	if (!first) {
		/* If not the first, add a comma. JSON does not allow trailing commas. */
		putchar(',');
	}

	putchar('\n');
	/* open another array for this particular record */
	putchar('[');

	for (i = 0; i < nr_consumers; i++) {
		sc = &consumers[i];
		total += sc->private_size + sc->shared_size;
	}

	/* %llu takes an unsigned long long, which neither time_t nor uint64_t need be:
	 * convert explicitly */
	printf("{\"record\":\"record_start\", \"epoch\":\"%llu\", \"total\":\"%llu\", \"total_anon\":\"%llu\", \"total_shared\":\"%llu\", \"total_file\":\"%llu\","
			"\"total_sysv\":\"%llu\", \"swapcache\":\"%llu\"}",
			(unsigned long long)time(NULL),
			(unsigned long long)(total >> 10),
			(unsigned long long)(total_anon >> 10),
			(unsigned long long)(total_shared >> 10),
			(unsigned long long)(total_file >> 10),
			(unsigned long long)(total_sysv >> 10),
			(unsigned long long)get_swapcache());

	for (i = 0; i < nr_consumers; i++) {
		sc = &consumers[i];
		putchar(',');
		printf("{\"record\":\"swap_entry\", \"name\":\"");
		/* JSON strings need to be escaped carefully, in order not to break parsing */
		print_json_str(sc->name);
		printf("\", \"private_swap\":\"%llu\", \"shared_swap\":\"%llu\"}",
				(unsigned long long)(sc->private_size >> 10),
				(unsigned long long)(sc->shared_size >> 10));
	}

	putchar(']');
	if (continuous)
		fflush(stdout);
}

/*
 * qsort() comparison callback ordering consumers by total swap (private
 * plus shared), largest first.
 */
static int swap_cmp(const void *p1, const void *p2)
{
	const struct swap_consumer *lhs = p1;
	const struct swap_consumer *rhs = p2;
	const uint64_t lhs_total = lhs->private_size + lhs->shared_size;
	const uint64_t rhs_total = rhs->private_size + rhs->shared_size;

	/* Descending order: the smaller total sorts after the bigger one */
	return (lhs_total < rhs_total) - (lhs_total > rhs_total);
}

/* Sort the consumer table in place, biggest swap user first. */
static void sort_swap(void)
{
	qsort(consumers, nr_consumers, sizeof(*consumers), swap_cmp);
}

/* Long command line options, as handed to getopt_long() by main(). Entries
 * with a NULL third field are reported to main() through the character in the
 * last field; the others store the last field into the variable the third
 * field points to. The array ends with an all-zero entry. */
const struct option long_options[] = {
	{"help", no_argument, NULL, 'h'},
	{"version", no_argument, NULL, 'v'},
	{"summary", no_argument, &do_summary, 1},
	{"no-pss", no_argument, &no_pss, 1},
	{"ignore-shared", no_argument, &ignore_shared, 1},
	{"json", no_argument, &do_json, 1},
	{"continuous", no_argument, &continuous, 1},
	{"no-continuous", no_argument, &continuous, 0},
	{"interval", required_argument, NULL, 'i'},
	{ 0 },
};

/*
 * Print the usage text on stdout and exit.
 *
 * flag: the option that led here. The exit status is 1 if it is '?'
 *       (an invalid option), 0 otherwise.
 */
void show_help(int flag)
{
    static const char usage[] =
        "Usage:\n   swaptop [OPTIONS]\n"
        "Account and list swap users (processes, tmpfs files, etc) in a detailed fashion.\n\n"
        "Options:\n"
        "   -h/--help                 print help and exit\n"
        "   -v/--version              print version and exit\n"
        "   -s/--summary              print a top-view summary and exit\n"
        "   -p/--no-pss               take the total pinned swap into account, not PSS\n"
        "   -j/--json                 output in json\n"
        "   -c/--continuous           output continuously (default if stdin and stdout are a tty)\n"
        "   --no-continuous           do not output continuously\n"
        "   --ignore-shared           ignore shared mappings when accounting process swap usage\n"
        "   -i/--interval SECS        if continuous, measure every SECS seconds. Defaults to 5 seconds\n";

    fputs(usage, stdout);
    /* Exit with 1 if it was an invalid flag. */
    exit(flag == '?' ? 1 : 0);
}

/* Print the build date of the program on stdout and exit successfully. */
void show_version()
{
    puts("swaptop built on " __DATE__);
    exit(0);
}

/*
 * Entry point: parse the command line, then scan processes, shmem-backed
 * filesystems and SysV shared memory, and print the result. With continuous
 * output the scan is repeated every `interval` seconds until the process is
 * killed; otherwise it is done once.
 *
 * Returns 0. Invalid options end the program through show_help().
 */
int main(int argc, char **argv)
{
	int indexptr = 0;
	int interval = 5;
	long val;
	bool first = true;
	/* getopt_long() returns an int. Storing it in a char breaks the comparison
	 * with -1 wherever plain char is unsigned. */
	int flag;
	char *endptr;

	/* By default, do interactive mode if stdin&out are a tty */
	interactive = isatty(STDOUT_FILENO) && isatty(STDIN_FILENO);

	if (interactive) {
		/* Get the number of rows in the terminal. If not available, default to 80x25 */
		struct winsize ws;
		/* NOTE(review): this also forces 80x25 on terminals that report 25 rows or
		 * less, or 80 columns or less - confirm that this is intended. */
		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) < 0 || ws.ws_row <= 25 || ws.ws_col <= 80) {
			ws.ws_row = 25;
			ws.ws_col = 80;
		}
		term_rows = ws.ws_row;
		term_cols = ws.ws_col;
	}

	while ((flag = getopt_long(argc, argv, "vhspjci:", long_options, &indexptr)) != -1) {
		switch (flag)
		{
			case '?':
			case 'h':
				show_help(flag);
				break;
			case 'v':
				show_version();
				break;
			case 's':
				do_summary = 1;
				break;
			case 'p':
				no_pss = 1;
				break;
			case 'j':
				do_json = 1;
				break;
			case 'c':
				continuous = 1;
				break;
			case 'i':
				errno = 0;
				val = strtol(optarg, &endptr, 10);
				if (errno != 0 || endptr == optarg || *endptr != '\0' || val < 1 || val > INT_MAX) {
					fprintf(stderr, "swaptop: invalid interval '%s' (expected a positive number of seconds)\n", optarg);
					show_help('?');
				}
				/* Actually use the value that was just parsed */
				interval = (int)val;
				break;
			default:
				/* 0: a long option that only stored a value into its flag variable */
				break;
		}
	}

	if (do_summary)
		interactive = 0;

	if (interactive)
		continuous = 1;

	if (do_json) {
		/* open the array */
		putchar('[');
		interactive = 0;
	}

	for (;;) {
		list_procs();
		list_shmemfs();
		list_sysvshm();
		sort_swap();
		if (interactive) {
			/* clear the screen, if interactive */
			printf("\033[2J\033[H");
			fflush(stdout);
		}

		if (do_json)
			print_json(first);
		else
			print_swap();
		free_swap_info();
		first = false;
		if (!continuous)
			break;
		/* Do not sit on a finished report while sleeping, e.g. when piped */
		fflush(stdout);
		sleep((unsigned int)interval);
	}

	if (do_json) {
		/* close the array, if we can */
		printf("]\n");
	}

	return 0;
}
/* openSUSE Build Service is sponsored by */