/* Steve, the jobserver
 * (c) 2025 Michał Górny
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * Inspired by CUSE example, nixos-jobserver (draft) and guildmaster:
 * https://github.com/libfuse/libfuse/blob/f58d4c5b0d56116d8870753f6b9d1620ee082709/example/cuse.c
 * https://github.com/RaitoBezarius/nixpkgs/blob/e97220ecf1e8887b949e4e16547bf0334826d076/pkgs/by-name/ni/nixos-jobserver/nixos-jobserver.cpp#L213
 * https://codeberg.org/amonakov/guildmaster/
 */

#define FUSE_USE_VERSION 31

#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include <climits>
#include <cmath>
#include <csignal>
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
#include <optional>
#include <print>
#include <string>
#include <string_view>
#include <unordered_map>
#include <variant>

#include <getopt.h>
#include <grp.h>
#include <pwd.h>
#include <sys/poll.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <event2/event.h>

#include <cuse_lowlevel.h>
#include <fuse.h>
#include <fuse_opt.h>

#include "steve.h"
#include "util.hxx"

struct steve_waiter {
	std::variant<fuse_req_t, fuse_pollhandle *> handle;
	uint64_t pid;

	steve_waiter(fuse_req_t new_req, uint64_t new_pid)
		: handle(new_req), pid(new_pid) {}
	steve_waiter(fuse_pollhandle *new_poll_handle, uint64_t new_pid)
		: handle(new_poll_handle), pid(new_pid) {}

	steve_waiter(const steve_waiter &) = delete;
	steve_waiter& operator=(const steve_waiter &) = delete;

	steve_waiter(steve_waiter &&other)
		: handle(other.handle), pid(other.pid)
	{
		other.handle = static_cast<fuse_pollhandle *>(nullptr);
	}
	steve_waiter& operator=(steve_waiter &&other) {
		handle = other.handle;
		pid = other.pid;
		other.handle = static_cast<fuse_pollhandle *>(nullptr);
		return *this;
	}

	~steve_waiter() {
		if (fuse_pollhandle **poll_handle = std::get_if<fuse_pollhandle *>(&handle)) {
			if (*poll_handle)
				fuse_pollhandle_destroy(*poll_handle);
		}
	}
};

/* Bookkeeping for a single job token handed out to a process. */
struct steve_job_info {
	/* monotonic timestamp recorded when the job was registered */
	std::chrono::steady_clock::time_point start_time;
};

/* owning handle for a libevent event, released through the stored deleter */
using event_ptr = std::unique_ptr<struct event, std::function<void(struct event*)>>;
/* jobs currently running within one process, keyed by job number */
using running_job_map = std::unordered_map<uint64_t, steve_job_info>;

struct steve_process {
	int pid_fd{-1};
	ssize_t tokens_held{0};
	bool token_reserved{false};
	event_ptr pidfd_event;
	std::optional<char> extra_token;
	running_job_map running_jobs;
	bool warned_incorrect_token{false};
	std::string cmdline;

	~steve_process() {
		if (pid_fd != -1)
			close(pid_fd);
	}
};

/* Result of checking whether a token can be handed to a process. */
enum class steve_token_availability {
	/* a token can be given right now */
	available,
	/* the global token pool is exhausted */
	no_tokens,
	/* load average is above the configured limit (or a recheck is pending) */
	load_exceeded,
	/* the process already holds as many tokens as the per-process limit allows */
	per_process_limit_exceeded,
	/* available memory is below the configured minimum */
	memory_use_exceeded,
};

/* Global server state, passed as userdata to the CUSE and libevent callbacks.
 *
 * Every scalar member carries a default so that a default-initialized
 * instance never exposes indeterminate values.
 */
struct steve_state {
	/* process exit status; set to 1 when reading from CUSE fails */
	int retval{0};
	const char *dev_name{"steve"};
	bool verbose{false};
	/* total number of job tokens managed */
	int64_t jobs{0};
	/* tokens handed out regardless of load or memory limits */
	int64_t min_jobs{1};
	/* <= 0 implies no per-process limit */
	int64_t per_process_limit{0};
	double max_load_avg{-1};  /* <= 0 implies no load average */
	double load_avg{-1};
	int64_t min_memory_avail{-1}; /* <= 0 implies none */
	/* memory values are in MiB, -1 if unknown */
	int64_t memory_avail{-1};
	int64_t memory_total{-1};
	/* tokens currently free */
	int64_t tokens{0};
	std::deque<steve_waiter> waiters;
	std::unordered_map<uint64_t, steve_process> processes;
	struct event_base *evb{nullptr};
	/* -2 = not opened yet, -1 = unavailable, otherwise an open descriptor */
	int loadavg_fd{-2};
	int meminfo_fd{-2};
	struct timeval recheck_timeout{0, 500000};
	/* set while the recheck timer is armed; holds the reason for throttling */
	std::optional<steve_token_availability> recheck_triggered;
	event_ptr recheck_event{nullptr, event_free};
	struct fuse_session *session{nullptr};
	/* keep a global buffer as an optimization */
	struct fuse_buf buf{};
};

/* Refresh state->load_avg.
 *
 * The preferred source is the fourth field of /proc/loadavg (the number
 * before the '/'), decreased by one to account for our own process.  If the
 * file cannot be opened, read or parsed, the descriptor is closed for good
 * and getloadavg() is used instead.  If that fails too, load_avg is set
 * to -0.0.
 */
static void steve_get_load(steve_state *state)
{
	/* -2 means "not tried yet" */
	if (state->loadavg_fd == -2) {
		state->loadavg_fd = open("/proc/loadavg", O_RDONLY);
		if (state->loadavg_fd == -1)
			perror("Unable to open /proc/loadavg, falling back to getloadavg()");
	}

	if (state->loadavg_fd != -1) {
		char buf[64];
		ssize_t rd = pread(state->loadavg_fd, buf, sizeof(buf) - 1, 0);

		if (rd >= 0) {
			buf[rd] = 0;

			/* skip the first three space-separated fields */
			char *begin = buf;
			for (int field = 0; *begin && field != 3; ++begin) {
				if (*begin == ' ')
					++field;
			}
			/* strchr() yields nullptr when the separator is missing */
			char *end = strchr(begin, '/');

			if (*begin && end) {
				*end = 0;
				long load;
				/* Decrease by one to account for our process. */
				if (arg_to_long(begin, &load)) {
					state->load_avg = load - 1;
					return;
				}
				/* restore the buffer so the diagnostic shows all of it */
				*end = '/';
			}

			std::print(stderr, "Parsing /proc/loadavg failed, value = {}\n", buf);
		} else
			perror("Reading /proc/loadavg failed, falling back to getloadavg()");

		close(state->loadavg_fd);
		state->loadavg_fd = -1;
	}

	if (getloadavg(&state->load_avg, 1) == -1) {
		static bool warned = false;
		if (!warned) {
			perror("getloadavg() failed, will ignore (further warnings will be suppressed)");
			warned = true;
		}
		/* to make it clear it failed */
		state->load_avg = -0.0;
	}
}

/* Read one value from a /proc/meminfo-style file.
 *
 * fd:    open descriptor, always read from offset 0
 * label: line prefix to look for, including a leading newline
 *        (e.g. "\nMemTotal:"); a newline is prepended to the data so that
 *        the first line can match as well
 *
 * Returns the number found after the label, which must be followed by
 * " kB" and a newline, or an empty optional (after printing a diagnostic)
 * on any failure.
 */
static std::optional<long> steve_get_meminfo(int fd, std::string_view label)
{
	char buf[4096] = {"\n"};
	ssize_t rd = pread(fd, &buf[1], sizeof(buf) - 2, 0);

	if (rd < 0) {
		perror("Reading /proc/meminfo failed, memory use will not be available");
		return {};
	}

	buf[rd + 1] = 0;
	/* the label without its leading newline, for diagnostics */
	const std::string_view name = label.empty() ? label : label.substr(1);

	/* search through string_view: label need not be NUL-terminated */
	const std::string_view contents{buf, static_cast<size_t>(rd) + 1};
	const size_t pos = contents.find(label);
	if (pos == std::string_view::npos) {
		std::print(stderr, "Parsing /proc/meminfo failed: no {}\n", name);
		return {};
	}
	char *match = buf + pos + label.size();
	match += strspn(match, " ");

	char *end = match + strspn(match, "0123456789");
	if (strncmp(end, " kB\n", 4)) {
		std::print(stderr, "Parsing /proc/meminfo failed: {} not suffixed by ' kB'\n", name);
		return {};
	}
	*end = 0;

	long val;
	if (!arg_to_long(match, &val)) {
		std::print(stderr, "Parsing /proc/meminfo failed: {} is not a valid positive long\n", match);
		return {};
	}

	return val;
}

/* Look up `label` in /proc/meminfo, opening the file on first use.
 *
 * Once opening or parsing fails the descriptor is marked unavailable (-1)
 * and every later call returns an empty optional.
 */
static std::optional<long> steve_get_meminfo(steve_state *state, std::string_view label)
{
	/* -2: no attempt to open the file was made yet */
	if (state->meminfo_fd == -2) {
		state->meminfo_fd = open("/proc/meminfo", O_RDONLY);
		if (state->meminfo_fd == -1)
			perror("Unable to open /proc/meminfo, memory use will not be available");
	}

	if (state->meminfo_fd == -1)
		return std::nullopt;

	std::optional<long> value = steve_get_meminfo(state->meminfo_fd, label);
	if (!value) {
		/* give up on the file for good */
		close(state->meminfo_fd);
		state->meminfo_fd = -1;
	}
	return value;
}

/* Refresh state->memory_avail (in MiB) from MemAvailable; -1 if it cannot be read. */
static void steve_get_memory_use(steve_state *state)
{
	const std::optional<long> avail_kb = steve_get_meminfo(state, "\nMemAvailable:");

	if (avail_kb)
		state->memory_avail = *avail_kb / 1024;
	else
		state->memory_avail = -1;
}

/* Arm the recheck timer and record why tokens are being withheld.
 * If the timer cannot be armed, nothing is recorded, so that the next
 * request checks the limits again. */
static void steve_trigger_recheck(steve_state *state, steve_token_availability reason)
{
	assert(!state->recheck_triggered);
	if (evtimer_add(state->recheck_event.get(), &state->recheck_timeout) == -1)
		std::print(stderr, "failed to enable recheck timer\n");
	else
		state->recheck_triggered = reason;
}

/* Decide whether the process identified by `pid` may receive a token now.
 *
 * The checks are made in this order: reserved token, per-process limit,
 * min-jobs guarantee, empty pool, pending recheck, load average, available
 * memory.  When the load or memory limit is hit, the recheck timer is armed
 * and, until it fires, the recorded reason is returned without sampling the
 * system again.
 *
 * Note that a process entry is created if `pid` is not known yet.
 */
static steve_token_availability steve_can_give_token(steve_state *state, uint64_t pid)
{
	steve_process &process = state->processes[pid];

	/* if there is a token reserved, we give it immediately (even if load is exceeded now) */
	if (process.token_reserved)
		return steve_token_availability::available;
	/* if the per-process limit is reached, do not provide a token */
	if (state->per_process_limit > 0 && process.tokens_held >= state->per_process_limit)
		return steve_token_availability::per_process_limit_exceeded;
	/* if min_jobs is not satisfied, we always give out a token */
	if (state->jobs < state->min_jobs + state->tokens)
		return steve_token_availability::available;

	/* checked before the pending recheck: the pool may have been drained
	 * (e.g. by lowering the job count) while the timer was armed */
	if (state->tokens <= 0)
		return steve_token_availability::no_tokens;
	/* if we have been throttled, wait for the recheck to time out and
	 * report the reason that was recorded when it was triggered */
	if (state->recheck_triggered)
		return *state->recheck_triggered;

	if (state->max_load_avg > 0) {
		steve_get_load(state);
		if (state->load_avg > state->max_load_avg) {
			steve_trigger_recheck(state, steve_token_availability::load_exceeded);
			return steve_token_availability::load_exceeded;
		}
	}
	if (state->min_memory_avail > 0) {
		steve_get_memory_use(state);
		if (state->memory_avail < state->min_memory_avail) {
			steve_trigger_recheck(state, steve_token_availability::memory_use_exceeded);
			return steve_token_availability::memory_use_exceeded;
		}
	}

	return steve_token_availability::available;
}

/* Pick the token byte for the next job of `process` and register the job.
 *
 * If the process returned a token before acquiring any (extra_token), that
 * value is reused as the job number.  Otherwise the lowest job number that
 * is not running is taken.  The returned byte is the low 8 bits of the job
 * number.
 */
static char steve_get_token_char(steve_process *process)
{
	int64_t job_num;

	if (process->extra_token.has_value()) {
		assert(process->running_jobs.empty());
		/* convert through unsigned char: where char is signed, bytes >= 0x80
		 * would otherwise be sign-extended into a huge job number */
		job_num = static_cast<unsigned char>(process->extra_token.value());
		process->extra_token.reset();
	} else {
		/* find first free token */
		/* TODO: can we optimize this? */
		for (job_num = 0; process->running_jobs.contains(job_num); ++job_num);
	}

	process->running_jobs.emplace(
		job_num, steve_job_info{std::chrono::steady_clock::now()}
	);
	return static_cast<char>(job_num & 0xFF);
}

/* Describe a process for log messages: "PID n (cmdline)" if the command
 * line is known, plain "PID n" otherwise.  `process` may be null. */
static std::string steve_process_id(uint64_t pid, const steve_process *process) {
	const bool has_cmdline = process != nullptr && !process->cmdline.empty();

	return has_cmdline
		? std::format("PID {} ({})", pid, process->cmdline)
		: std::format("PID {}", pid);
}

/* Format the token counters of one process for log messages.
 * The reservation flag is appended unless `include_reserved` is false. */
static std::string steve_process_token_stats(
	const steve_process *process,
	bool include_reserved = true)
{
	assert(process);

	if (!include_reserved)
		return std::format("{} tokens held by process", process->tokens_held);

	std::string stats = std::format("{} tokens held by process", process->tokens_held);
	stats += std::format(", token reserved: {}", process->token_reserved);
	return stats;
}

/* Format global and per-process token counters for log messages.
 * Load average and memory figures are appended for the limits that are
 * enabled, unless `include_load_mem` is false. */
static std::string steve_token_stats(
	const steve_state *state,
	const steve_process *process,
	bool include_reserved = true,
	bool include_load_mem = true)
{
	assert(state);
	assert(process);

	std::string stats = std::format("{} left, {}",
		state->tokens, steve_process_token_stats(process, include_reserved));
	if (!include_load_mem)
		return stats;

	const bool load_limited = state->max_load_avg > 0;
	const bool memory_limited = state->min_memory_avail > 0;

	if (load_limited)
		stats += std::format(", load average = {:.3} (limit: {})",
			state->load_avg, state->max_load_avg);
	if (memory_limited)
		stats += std::format(", memory available = {} MiB (min: {} MiB)",
			state->memory_avail, state->min_memory_avail);
	return stats;
}

/* Answer the read request `req` with a single token byte for `pid`.
 *
 * A previously reserved token is consumed if there is one; otherwise the
 * token is taken from the global pool.
 */
static void steve_give_token(steve_state *state, fuse_req_t req, uint64_t pid)
{
	steve_process &proc = state->processes.at(pid);
	const char token = steve_get_token_char(&proc);
	const bool from_reservation = proc.token_reserved;

	if (from_reservation) {
		/* the pool was already charged when the token was reserved */
		proc.token_reserved = false;
	} else {
		--state->tokens;
	}
	++proc.tokens_held;

	if (state->verbose) {
		if (from_reservation)
			std::print(stderr, "Giving reserved token 0x{:02x} to {}, {}\n",
				token, steve_process_id(pid, &proc),
				steve_token_stats(state, &proc, false, false));
		else
			std::print(stderr, "Giving job token 0x{:02x} to {}, {}\n",
				token,
				steve_process_id(pid, &proc),
				steve_token_stats(state, &proc));
	}
	fuse_reply_buf(req, &token, 1);
}

/* Set one token aside for `pid` so that its next read succeeds.
 * Does nothing if the process already has a reservation. */
static void steve_reserve_token(steve_state *state, uint64_t pid)
{
	steve_process &proc = state->processes.at(pid);

	if (!proc.token_reserved) {
		--state->tokens;
		proc.token_reserved = true;
		if (state->verbose)
			std::print(stderr, "Reserving job token for {}, {}\n",
				steve_process_id(pid, &proc),
				steve_token_stats(state, &proc, false));
	}

	/* TODO: we need to handle expiring reservations if client doesn't read */
}

/* Serve queued waiters in order for as long as tokens can be given.
 *
 * Waiters blocked only by the per-process limit are skipped and stay
 * queued.  Any other reason for refusal stops processing.  Read waiters
 * receive a token, poll waiters get a token reserved and are notified.
 */
static void steve_wake_waiters(steve_state *state)
{
	auto waiter = state->waiters.begin();

	while (waiter != state->waiters.end()) {
		const steve_token_availability avail = steve_can_give_token(state, waiter->pid);

		if (avail == steve_token_availability::per_process_limit_exceeded) {
			++waiter;
			continue;
		}
		if (avail != steve_token_availability::available)
			return;

		if (std::holds_alternative<fuse_req_t>(waiter->handle)) {
			/* read request */
			steve_give_token(state, std::get<fuse_req_t>(waiter->handle), waiter->pid);
		} else if (std::holds_alternative<fuse_pollhandle *>(waiter->handle)) {
			/* poll request */
			steve_reserve_token(state, waiter->pid);
			if (state->verbose)
				std::print(stderr, "Notifying {} about POLLIN\n",
					steve_process_id(waiter->pid, &state->processes.at(waiter->pid)));
			fuse_lowlevel_notify_poll(std::get<fuse_pollhandle *>(waiter->handle));
		} else
			assert(0 && "invalid waiter");

		waiter = state->waiters.erase(waiter);
	}
}

/* libevent callback: the process behind `pid_fd` has exited.
 *
 * Drops all its waiters, returns its held and reserved tokens to the pool,
 * removes the process entry and then serves other waiters.
 */
static void steve_handle_pidfd(evutil_socket_t pid_fd, short, void *userdata) {
	steve_state *state = static_cast<steve_state *>(userdata);

	const auto proc_it = std::find_if(
		state->processes.begin(), state->processes.end(),
		[pid_fd](const auto &entry) { return entry.second.pid_fd == pid_fd; });
	if (proc_it == state->processes.end()) {
		assert(0 && "pidfd triggered for unknown process");
		return;
	}

	const uint64_t pid = proc_it->first;
	steve_process &process = proc_it->second;

	/* remove all waiters */
	auto waiter = state->waiters.begin();
	while (waiter != state->waiters.end()) {
		if (waiter->pid != pid) {
			++waiter;
			continue;
		}

		if (std::holds_alternative<fuse_req_t>(waiter->handle)) {
			/* can we even have read waiters at this point? */
			fuse_reply_err(std::get<fuse_req_t>(waiter->handle), EPIPE);
			if (state->verbose)
				std::print(stderr, "Cleaning up read waiter for {}\n",
					steve_process_id(pid, &process));
		} else if (std::holds_alternative<fuse_pollhandle *>(waiter->handle)) {
			/* notify the poller, just in case */
			fuse_lowlevel_notify_poll(std::get<fuse_pollhandle *>(waiter->handle));
			if (state->verbose)
				std::print(stderr, "Cleaning up poll notification for {}\n",
					steve_process_id(pid, &process));
		} else
			assert(0 && "invalid waiter");

		waiter = state->waiters.erase(waiter);
	}

	/* return all tokens held */
	state->tokens += process.tokens_held;
	if (process.token_reserved)
		++state->tokens;
	if (state->verbose || process.tokens_held > 0) {
		std::print(stderr, "{} exited while holding {} tokens, token reserved: {}, "
				"{} tokens available after returning them\n",
				steve_process_id(pid, &process), process.tokens_held, process.token_reserved, state->tokens);
	}

	/* remove the process; `process` is dangling from here on */
	state->processes.erase(proc_it);
	/* if we have new tokens, wake the waiters */
	steve_wake_waiters(state);
	/* make sure the process wasn't readded */
	assert(state->processes.find(pid) == state->processes.end());
}

/* CUSE init callback: fill the token pool and print the active configuration. */
static void steve_init(void *userdata, struct fuse_conn_info *)
{
	auto *state = static_cast<steve_state *>(userdata);
	const bool load_limited = state->max_load_avg > 0;
	const bool memory_limited = state->min_memory_avail > 0;

	/* all tokens are free at startup */
	state->tokens = state->jobs;

	std::print(stderr, "steve running on /dev/{} for {} jobs\n", state->dev_name, state->jobs);
	if (load_limited)
		std::print(stderr, "  tokens will be served with load average < {:.3}\n", state->max_load_avg);
	if (memory_limited)
		std::print(stderr, "  tokens will be served with memory available >= {} MiB (out of {} MiB)\n",
			state->min_memory_avail, state->memory_total);
	if (load_limited || memory_limited)
		std::print(stderr, "  with a recheck timeout of {} s {} us\n",
				state->recheck_timeout.tv_sec, state->recheck_timeout.tv_usec);
	if (state->min_jobs > 0)
		std::print(stderr, "  at least {} jobs will be always available\n", state->min_jobs);
	if (state->per_process_limit > 0)
		std::print(stderr, "  per-process limit set to {}\n", state->per_process_limit);
}

/* CUSE destroy callback: release everything owned by the state.
 *
 * Drops waiters and processes, closes both /proc descriptors and frees the
 * receive buffer.  Released resources are reset so that running the
 * cleanup twice is harmless.
 */
static void steve_destroy(void *userdata)
{
	steve_state *state = static_cast<steve_state *>(userdata);

	state->waiters.clear();
	state->processes.clear();
	if (state->loadavg_fd >= 0) {
		close(state->loadavg_fd);
		state->loadavg_fd = -1;
	}
	if (state->meminfo_fd >= 0) {
		close(state->meminfo_fd);
		state->meminfo_fd = -1;
	}
	if (state->buf.mem) {
		free(state->buf.mem);
		state->buf.mem = nullptr;
	}
}

/* Read the start of /proc/<pid>/cmdline (at most 128 bytes) for logging.
 * The trailing NUL is dropped and the NULs separating arguments are
 * replaced with spaces.  Returns an empty string if nothing could be read. */
static std::string steve_read_cmdline(uint64_t pid)
{
	std::string cmdline;
	const std::string path = std::format("/proc/{}/cmdline", pid);

	FILE *cmdline_file = fopen(path.c_str(), "r");
	if (!cmdline_file)
		return cmdline;

	cmdline.resize(128);
	size_t rd = fread(cmdline.data(), 1, cmdline.size(), cmdline_file);
	fclose(cmdline_file);

	/* remove the final null terminator (if any); look at the last byte
	 * actually read, and never step below zero on an empty read */
	if (rd > 0 && cmdline[rd - 1] == '\0')
		--rd;
	cmdline.resize(rd);
	/* replace all NULs with spaces */
	std::replace(cmdline.begin(), cmdline.end(), '\0', ' ');
	return cmdline;
}

/* CUSE open callback.
 *
 * Stores the opener's PID in fi->fh and, for a PID that is not known yet,
 * creates the process entry together with a pidfd watched for exit.  The
 * open is rejected with EIO if the pidfd or its event cannot be set up.
 */
static void steve_open(fuse_req_t req, struct fuse_file_info *fi)
{
	const struct fuse_ctx *context = fuse_req_ctx(req);
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));

	/* pid is not available in release, so store it here */
	static_assert(sizeof(fi->fh) >= sizeof(context->pid));
	fi->fh = context->pid;

	auto process_it = state->processes.find(fi->fh);
	if (process_it != state->processes.end()) {
		if (state->verbose)
			std::print(stderr, "Device open again by {}\n",
				steve_process_id(fi->fh, &process_it->second));
		assert(process_it->second.pid_fd != -1);
		assert(process_it->second.pidfd_event);
		fuse_reply_open(req, fi);
		return;
	}

	std::string cmdline = steve_read_cmdline(fi->fh);

	if (state->verbose) {
		if (!cmdline.empty()) {
			std::print(stderr, "Device open by PID {} ({})\n", fi->fh, cmdline);
		} else
			std::print(stderr, "Device open by PID {} (process name unknown)\n", fi->fh);
	}

	int pid_fd = syscall(SYS_pidfd_open, context->pid, 0);
	if (pid_fd == -1) {
		std::print(stderr, "Unable to open pidfd for PID {}, rejecting to open: {}\n",
			fi->fh, strerror(errno));
		fuse_reply_err(req, EIO);
		return;
	}

	event_ptr pidfd_event{event_new(state->evb, pid_fd, EV_READ|EV_PERSIST, steve_handle_pidfd, state), event_free};
	if (!pidfd_event) {
		std::print(stderr, "Unable to allocate pidfd event for PID {}\n", fi->fh);
		close(pid_fd);
		fuse_reply_err(req, EIO);
		return;
	}
	if (event_add(pidfd_event.get(), nullptr) == -1) {
		std::print(stderr, "Unable to enable pidfd event for PID {}\n", fi->fh);
		close(pid_fd);
		fuse_reply_err(req, EIO);
		return;
	}

	steve_process *process = &state->processes[fi->fh];
	process->pid_fd = pid_fd;
	process->pidfd_event = std::move(pidfd_event);
	process->cmdline = std::move(cmdline);
	fuse_reply_open(req, fi);
}

/* CUSE release callback: log the close and acknowledge it.
 *
 * The process entry may already be gone (it is removed as soon as the
 * opener exits), so it is looked up without throwing and the log message
 * falls back to the bare PID.
 */
static void steve_release(fuse_req_t req, struct fuse_file_info *fi)
{
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));

	if (state->verbose) {
		const auto process_it = state->processes.find(fi->fh);
		const steve_process *process =
			process_it != state->processes.end() ? &process_it->second : nullptr;
		std::print(stderr, "Device closed by {}\n",
			steve_process_id(fi->fh, process));
	}

	fuse_reply_err(req, 0);
}

/* Interrupt callback for a blocked read: fail the request with EINTR and
 * drop the matching read waiter, if one is queued. */
static void steve_interrupt(fuse_req_t req, void *userdata)
{
	steve_state *state = static_cast<steve_state *>(userdata);

	fuse_reply_err(req, EINTR);

	const auto match = std::find_if(
		state->waiters.begin(), state->waiters.end(),
		[req](const steve_waiter &waiter) {
			const fuse_req_t *read_req = std::get_if<fuse_req_t>(&waiter.handle);
			return read_req != nullptr && *read_req == req;
		});
	if (match == state->waiters.end())
		return;

	if (state->verbose)
		std::print(stderr, "Passed EINTR to {}\n",
			steve_process_id(match->pid, &state->processes.at(match->pid)));
	state->waiters.erase(match);
}

/* Log why a token request from `pid` has to wait.
 * Must not be called with steve_token_availability::available. */
static void steve_explain_no_token(
	steve_token_availability token_avail,
	const steve_state *state,
	uint64_t pid,
	const steve_process *process)
{
	const std::string who = steve_process_id(pid, process);

	switch (token_avail) {
		case steve_token_availability::available:
			assert(0 && "not reached");
			break;
		case steve_token_availability::no_tokens:
			std::print(stderr, "No free job token for {}, waiting, {}\n",
					who, steve_process_token_stats(process, false));
			break;
		case steve_token_availability::per_process_limit_exceeded:
			std::print(stderr, "{} exceeded per-process token limit, waiting, {} tokens free\n",
					who, state->tokens);
			break;
		case steve_token_availability::load_exceeded:
			std::print(stderr, "Load exceeded while {} requested token, waiting, {} tokens free, "
					"{}, load average {:.3} >= {}\n",
					who, state->tokens,
					steve_process_token_stats(process, false),
					state->load_avg, state->max_load_avg);
			break;
		case steve_token_availability::memory_use_exceeded:
			std::print(stderr, "Memory use exceeded while {} requested token, waiting, {} tokens free, "
					"{}, memory available: {} MiB < {} MiB\n",
					who, state->tokens,
					steve_process_token_stats(process, false),
					state->memory_avail, state->min_memory_avail);
			break;
	}
}

/* CUSE read callback: hand out one token.
 *
 * Replies immediately if a token is available.  Otherwise a non-blocking
 * reader gets EAGAIN and a blocking reader is queued until a token can be
 * given or the request is interrupted.
 */
static void steve_read(
	fuse_req_t req, size_t size, off_t off, struct fuse_file_info *fi)
{
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));
	const uint64_t pid = fi->fh;

	if (off != 0) {
		fuse_reply_err(req, EIO);
		return;
	}
	if (size == 0) {
		fuse_reply_buf(req, "", 0);
		return;
	}

	/* no need to support reading more than one token at a time */
	const steve_token_availability avail = steve_can_give_token(state, pid);
	if (avail == steve_token_availability::available) {
		steve_give_token(state, req, pid);
		return;
	}

	if (fi->flags & O_NONBLOCK) {
		fuse_reply_err(req, EAGAIN);
		return;
	}

	/* queue the request first, then make it interruptible */
	state->waiters.emplace_back(req, pid);
	if (state->verbose)
		steve_explain_no_token(avail, state, pid, &state->processes.at(pid));
	fuse_req_interrupt_func(req, steve_interrupt, state);
}

/* Find the running job that a returned token byte refers to.
 *
 * Returns the oldest job whose job number has `token` as its low 8 bits.
 * If no job matches (bad client), the oldest job overall is returned.
 * Returns running_jobs.end() only if no job is running at all.
 */
static running_job_map::iterator steve_find_running_job(steve_process *process, char token)
{
	const auto end = process->running_jobs.end();
	/* compare as unsigned bytes: where char is signed, tokens >= 0x80 are
	 * negative and would never equal the masked job number */
	const uint8_t wanted = static_cast<uint8_t>(token);
	auto oldest_match = end;
	auto oldest_any = end;

	/* TODO: optimize this */
	for (auto it = process->running_jobs.begin(); it != end; ++it) {
		if (oldest_any == end || it->second.start_time < oldest_any->second.start_time)
			oldest_any = it;
		if (static_cast<uint8_t>(it->first & 0xFF) != wanted)
			continue;
		if (oldest_match == end || it->second.start_time < oldest_match->second.start_time)
			oldest_match = it;
	}

	/* No matching token? We're dealing with a bad client, just return the oldest token. */
	return oldest_match != end ? oldest_match : oldest_any;
}

/* CUSE write callback: a client returns tokens, one byte per token.
 *
 * A single byte written while holding no tokens is remembered as an
 * "extra" token (workaround for clients that release before acquiring).
 * Otherwise the count is capped to the tokens actually held and the
 * matching running jobs are finished.  Replies ENOSPC if nothing could be
 * returned, otherwise acknowledges the (possibly capped) count and serves
 * waiters.
 */
static void steve_write(
	fuse_req_t req, const char *data, size_t size, off_t off,
	struct fuse_file_info *fi)
{
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));
	steve_process *process = &state->processes.at(fi->fh);

	if (off != 0) {
		fuse_reply_err(req, EIO);
		return;
	}
	if (size > SSIZE_MAX) {
		std::print(stderr, "Warning: {} tried to return more than SSIZE_MAX tokens\n",
				steve_process_id(fi->fh, process));
		fuse_reply_err(req, EFBIG);
		return;
	}

	/* workaround for https://github.com/medek/nasm-rs/issues/44 */
	if (process->tokens_held == 0 && size == 1) {
		assert(!process->extra_token.has_value());
		assert(process->running_jobs.empty());
		process->extra_token = data[0];
		std::print(stderr, "Warning: {} pre-released an unacquired token 0x{:02x}, please report a bug upstream\n",
				steve_process_id(fi->fh, process), data[0]);
	} else {
		if (process->tokens_held < static_cast<ssize_t>(size)) {
			std::print(stderr, "Warning: {} tried to return {} tokens while holding only {} tokens, capping\n",
					steve_process_id(fi->fh, process), size, process->tokens_held);
			if (process->tokens_held < 0)
				size = 0;
			else
				size = process->tokens_held;
		}

		/* Finish the running jobs */
		std::chrono::time_point<std::chrono::steady_clock> current_time =
			std::chrono::steady_clock::now();
		for (const char *token = data; token < data + size; ++token) {
			auto it = steve_find_running_job(process, *token);
			/* no job records left; the tokens are still returned below */
			if (it == process->running_jobs.end())
				break;

			/* compare as unsigned bytes so that tokens >= 0x80 match
			 * where char is signed */
			const bool token_matches =
				static_cast<uint8_t>(it->first & 0xFF) == static_cast<uint8_t>(*token);
			if (!token_matches && !process->warned_incorrect_token) {
				std::print(stderr, "Warning: {} returned incorrect token value 0x{:02x}, please report a bug upstream\n",
						steve_process_id(fi->fh, process), *token);
				process->warned_incorrect_token = true;
			}
			if (state->verbose)
				std::print(stderr, "{} job 0x{:02x} finished after {}\n",
					steve_process_id(fi->fh, process), it->first,
					std::chrono::duration<double>(current_time - it->second.start_time));

			process->running_jobs.erase(it);
		}
	}
	if (size == 0) {
		fuse_reply_err(req, ENOSPC);
		return;
	}

	state->tokens += size;
	process->tokens_held -= size;
	if (state->verbose)
		std::print(stderr, "{} returned {} tokens, {} available now, {}\n",
			steve_process_id(fi->fh, process), size, state->tokens,
			steve_process_token_stats(process));
	fuse_reply_write(req, size);

	/* Since we have jobs now, see if anyone's waiting */
	steve_wake_waiters(state);
}

/* CUSE poll callback.
 *
 * POLLOUT is always reported.  POLLIN is reported only if a token can be
 * given right now; otherwise the poll handle is queued so the client is
 * notified once a token gets reserved for it.
 *
 * Ownership of `ph` lies with this function: it is either moved into the
 * waiter queue or destroyed before returning.  `ph` is null when no
 * notification was requested, in which case nothing is queued.
 */
static void steve_poll(
	fuse_req_t req, struct fuse_file_info *fi, struct fuse_pollhandle *ph)
{
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));
	int events = fi->poll_events & (POLLIN | POLLOUT);

	/* POLLOUT is always possible, POLLIN only if we have any tokens */
	steve_token_availability token_avail = steve_can_give_token(state, fi->fh);
	if (token_avail != steve_token_availability::available) {
		events &= ~POLLIN;
		if (ph) {
			state->waiters.emplace_back(ph, fi->fh);
			/* the waiter owns the handle now */
			ph = nullptr;
		}
	}
	/* not queued: nobody would ever release the handle otherwise */
	if (ph)
		fuse_pollhandle_destroy(ph);

	if (state->verbose) {
		const steve_process *process = &state->processes.at(fi->fh);
		if (token_avail == steve_token_availability::available)
			std::print(stderr, "{} requested poll, {} tokens available, {}\n",
					steve_process_id(fi->fh, process), state->tokens,
					steve_process_token_stats(process, false));
		else
			steve_explain_no_token(token_avail, state, fi->fh, process);
	}

	fuse_reply_poll(req, events);
}

/* Convert a timeout in (fractional) seconds into a normalized timeval.
 *
 * The fraction is rounded to the nearest microsecond.  If that rounds up to
 * a full second, it is carried into tv_sec so that tv_usec always stays
 * below 1000000.  Callers are expected to pass a non-negative value that
 * fits tv_sec.
 */
static void steve_timeout_to_timeval(struct timeval *out, double timeout) {
	const double whole_seconds = std::trunc(timeout);
	long usec = static_cast<long>(std::round((timeout - whole_seconds) * 1000000));

	out->tv_sec = whole_seconds;
	if (usec >= 1000000) {
		/* e.g. 0.9999996 s rounds to a full 1000000 us */
		out->tv_sec += 1;
		usec -= 1000000;
	}
	out->tv_usec = usec;
}

/* CUSE ioctl callback: query or change the server configuration.
 *
 * GET ioctls reply with an int64_t or a double.  SET ioctls take their
 * argument from `in_buf`: a double for the load average and the recheck
 * timeout, an int64_t in [0, INT_MAX) for everything else.  The argument is
 * copied out only after checking that the buffer is large enough, and the
 * floating-point range checks are written so that NaN is rejected.
 * Invalid arguments get EINVAL, unknown ioctls ENOTTY.  Every successful
 * SET except the recheck timeout re-runs the waiter queue.
 */
static void steve_ioctl(
	fuse_req_t req, int cmd, void *, fuse_file_info *fi,
	unsigned, const void *in_buf, size_t in_bufsz, size_t)
{
	steve_state *state = static_cast<steve_state *>(fuse_req_userdata(req));
	/* FUSE uses the wrong type, sigh */
	unsigned ioctl_num = cmd;

	steve_process *process = &state->processes.at(fi->fh);
	if (state->verbose)
		std::print(stderr, "{} requested ioctl 0x{:08x}\n",
			steve_process_id(fi->fh, process), ioctl_num);

	int64_t val = 0;
	double dval = 0;
	if (STEVE_IOC_IS_SET(ioctl_num)) {
		switch (ioctl_num) {
			case STEVE_IOC_SET_LOAD_AVG:
			case STEVE_IOC_SET_LOAD_RECHECK_TIMEOUT: {
				if (!in_buf || in_bufsz < sizeof(dval)) {
					fuse_reply_err(req, EINVAL);
					return;
				}
				/* memcpy: the buffer carries no alignment guarantee */
				memcpy(&dval, in_buf, sizeof(dval));
				break;
			}
			default: {
				if (!in_buf || in_bufsz < sizeof(val)) {
					fuse_reply_err(req, EINVAL);
					return;
				}
				memcpy(&val, in_buf, sizeof(val));
				if (val < 0 || val >= INT_MAX) {
					fuse_reply_err(req, EINVAL);
					return;
				}
			}
		}
	}

	switch (ioctl_num) {
		case STEVE_IOC_GET_TOKENS:
			val = state->tokens;
			fuse_reply_ioctl(req, 0, &val, sizeof(val));
			break;
		case STEVE_IOC_GET_JOBS:
			val = state->jobs;
			fuse_reply_ioctl(req, 0, &val, sizeof(val));
			break;
		case STEVE_IOC_GET_LOAD_AVG:
			dval = state->max_load_avg;
			fuse_reply_ioctl(req, 0, &dval, sizeof(dval));
			break;
		case STEVE_IOC_GET_MIN_JOBS:
			val = state->min_jobs;
			fuse_reply_ioctl(req, 0, &val, sizeof(val));
			break;
		case STEVE_IOC_GET_MIN_MEMORY_AVAIL:
			val = state->min_memory_avail;
			fuse_reply_ioctl(req, 0, &val, sizeof(val));
			break;
		case STEVE_IOC_GET_LOAD_RECHECK_TIMEOUT:
			dval = state->recheck_timeout.tv_sec + (
				state->recheck_timeout.tv_usec / 1000000.
			);
			fuse_reply_ioctl(req, 0, &dval, sizeof(dval));
			break;
		case STEVE_IOC_GET_PER_PROCESS_LIMIT:
			val = state->per_process_limit;
			fuse_reply_ioctl(req, 0, &val, sizeof(val));
			break;
		case STEVE_IOC_SET_JOBS:
			/* 0 requests the number of online processors */
			if (val == 0)
				val = sysconf(_SC_NPROCESSORS_ONLN);
			/* shift the free pool by the change in total jobs */
			state->tokens += val - state->jobs;
			state->jobs = val;
			std::print(stderr, "{} set jobs to {}\n", steve_process_id(fi->fh, process), state->jobs);
			if (state->verbose)
				std::print(stderr, "  new token availability: {}\n", state->tokens);
			if (state->min_jobs > state->jobs) {
				state->min_jobs = state->jobs;
				if (state->verbose)
					std::print(stderr, "  capping min-jobs to {}\n", state->min_jobs);
			}
			fuse_reply_ioctl(req, 0, nullptr, 0);
			steve_wake_waiters(state);
			break;
		case STEVE_IOC_SET_MIN_JOBS:
			if (val > state->jobs) {
				fuse_reply_err(req, EINVAL);
				return;
			}
			state->min_jobs = val;
			std::print(stderr, "{} set min-jobs to {}\n", steve_process_id(fi->fh, process), state->min_jobs);
			fuse_reply_ioctl(req, 0, nullptr, 0);
			steve_wake_waiters(state);
			break;
		case STEVE_IOC_SET_MIN_MEMORY_AVAIL:
			/* total memory is looked up lazily, in MiB */
			if (state->memory_total == -1) {
				auto memtotal = steve_get_meminfo(state, "\nMemTotal:");
				if (memtotal.has_value())
					state->memory_total = memtotal.value() / 1024;
			}
			if (val > state->memory_total) {
				fuse_reply_err(req, EINVAL);
				return;
			}
			state->min_memory_avail = val;
			std::print(stderr, "{} set min. available memory to {} MiB\n", steve_process_id(fi->fh, process), state->min_memory_avail);
			fuse_reply_ioctl(req, 0, nullptr, 0);
			steve_wake_waiters(state);
			break;
		case STEVE_IOC_SET_PER_PROCESS_LIMIT:
			state->per_process_limit = val;
			std::print(stderr, "{} set per-process limit to {}\n", steve_process_id(fi->fh, process), state->per_process_limit);
			fuse_reply_ioctl(req, 0, nullptr, 0);
			steve_wake_waiters(state);
			break;
		case STEVE_IOC_SET_LOAD_AVG:
			/* negated form so that NaN is rejected as well */
			if (!(dval >= 1)) {
				fuse_reply_err(req, EINVAL);
				return;
			}
			state->max_load_avg = dval;
			std::print(stderr, "{} set load-average to {}\n", steve_process_id(fi->fh, process), state->max_load_avg);
			fuse_reply_ioctl(req, 0, nullptr, 0);
			steve_wake_waiters(state);
			break;
		case STEVE_IOC_SET_LOAD_RECHECK_TIMEOUT:
			/* negated form so that NaN is rejected as well */
			if (!(dval >= 0.000001 && dval <= INT_MAX)) {
				fuse_reply_err(req, EINVAL);
				return;
			}
			steve_timeout_to_timeval(&state->recheck_timeout, dval);
			std::print(stderr, "{} set load-recheck-timeout to {} s {} us\n",
					steve_process_id(fi->fh, process), state->recheck_timeout.tv_sec, state->recheck_timeout.tv_usec);
			fuse_reply_ioctl(req, 0, nullptr, 0);
			/* TODO: reset the event? */
			break;
		default:
			fuse_reply_err(req, ENOTTY);
	}
}

/* CUSE operation table wiring the device callbacks defined above.
 * Operations not listed are left unset; the pragma silences the
 * missing-field-initializers warning that would result. */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
static const struct cuse_lowlevel_ops steve_ops = {
	.init = steve_init,
	.destroy = steve_destroy,
	.open = steve_open,
	.read = steve_read,
	.write = steve_write,
	.release = steve_release,
	.ioctl = steve_ioctl,
	.poll = steve_poll,
};
#pragma GCC diagnostic pop

/* SIGUSR1 handler: dump the token pool and every process' running jobs to stderr. */
static void steve_handle_sigusr1(evutil_socket_t, short, void *userdata) {
	const auto *state = static_cast<steve_state *>(userdata);
	const auto now = std::chrono::steady_clock::now();

	std::print(stderr, "steve: currently {} tokens available out of {}\n",
			state->tokens, state->jobs);
	for (const auto &[pid, process] : state->processes) {
		std::print(stderr, "{} holds {} tokens:\n",
			steve_process_id(pid, &process), process.tokens_held);
		for (const auto &[job_num, job] : process.running_jobs)
			std::print(stderr, "  job 0x{:02x} running for {}\n", job_num,
				std::chrono::duration<double>(now - job.start_time));
	}
}

/* libevent callback: the CUSE descriptor is readable.
 *
 * Receives one request into the shared buffer and processes it.  The
 * buffer is processed only after a successful receive; after a failure it
 * still holds the previous request.  EINTR and EAGAIN are transient and
 * simply wait for the next event.  EOF and other errors stop the event
 * loop with a non-zero exit status.
 */
static void steve_handle_cuse(evutil_socket_t, short, void *userdata) {
	steve_state *state = static_cast<steve_state *>(userdata);

	int rd = fuse_session_receive_buf(state->session, &state->buf);
	if (rd > 0) {
		fuse_session_process_buf(state->session, &state->buf);
		return;
	}
	if (rd == -EINTR || rd == -EAGAIN)
		return;

	state->retval = 1;
	if (rd == 0)
		std::print(stderr, "EOF from CUSE, exiting (device already taken?)\n");
	else
		std::print(stderr, "Reading from CUSE failed: {}\n", strerror(-rd));
	if (event_base_loopbreak(state->evb) == -1) {
		std::print(stderr, "event_base_loopbreak() failed, forcing hard exit\n");
		exit(1);
	}
}

/* Recheck timer callback: lift the throttle and retry the queued waiters. */
static void steve_handle_recheck(evutil_socket_t, short, void *userdata) {
	auto *state = static_cast<steve_state *>(userdata);

	/* forget the recorded reason so that the limits are sampled again */
	state->recheck_triggered = std::nullopt;
	steve_wake_waiters(state);
}

/* Signal handler for termination signals: log the signal and stop the
 * event loop.  If the loop cannot be stopped, exit right away. */
static void steve_handle_exit(evutil_socket_t signum, short, void *userdata) {
	steve_state *state = static_cast<steve_state *>(userdata);

	std::print(stderr, "Exiting on SIG{}\n", signal_name(signum));
	if (event_base_loopbreak(state->evb) == -1) {
		std::print(stderr, "event_base_loopbreak() failed, forcing hard exit\n");
		exit(1);
	}
}

/* Create and enable a libevent signal event calling `handler` with `state`
 * as userdata.  Returns a null event_ptr (after printing a message) if the
 * event cannot be created or added. */
static event_ptr steve_new_signal_handler(steve_state *state, int signum, event_callback_fn handler)
{
	event_ptr ret{evsignal_new(state->evb, signum, handler, state), event_free};
	if (!ret) {
		std::print(stderr, "failed to initialize SIG{} handler\n", signal_name(signum));
		return nullptr;
	}
	if (event_add(ret.get(), nullptr) == -1) {
		std::print(stderr, "failed to enable SIG{} handler\n", signal_name(signum));
		return nullptr;
	}
	return ret;
}

/* Switch the process to the given user and that user's primary group.
 *
 * Steps, in order: look up the passwd entry, chdir to "/", clear the
 * supplementary group list, set the gid, set the uid.  The first step
 * that fails prints a diagnostic to stderr and aborts the sequence.
 *
 * Returns true if every step succeeded, false otherwise.
 */
static bool steve_drop_privileges(const char *user) {
	/* getpwnam() returns NULL both for "no such user" and for real
	 * errors; a cleared errno lets the two be told apart. */
	errno = 0;
	const struct passwd *entry = getpwnam(user);
	if (entry == nullptr) {
		if (errno == 0)
			std::print(stderr, "user {} not found\n", user);
		else
			perror("getpwnam() failed");
		return false;
	}

	if (chdir("/") == -1) {
		perror("chdir('/') failed");
		return false;
	}
	if (setgroups(0, nullptr) == -1) {
		perror("setgroups() failed");
		return false;
	}
	if (setgid(entry->pw_gid) == -1) {
		perror("setgid() failed");
		return false;
	}
	if (setuid(entry->pw_uid) == -1) {
		perror("setuid() failed");
		return false;
	}

	return true;
}

/* Help text; used as a std::print format string whose single {}
 * placeholder receives the program name (argv[0]). */
static constexpr char steve_usage[] = R"(usage: {} [options]

options:
    --help, -h             print this help message
    --version, -V          print version

    --jobs=JOBS, -j JOBS   jobs to use (default: nproc)
    --load-average=LOAD_AVG, -l LOAD_AVG
                           do not serve tokens unless load is below LOAD_AVG
    --load-recheck-timeout=TIMEOUT, -r TIMEOUT
                           timeout for throttling due to exceeded load, in sec
                           (fractional down to usec, default: 0.5)
    --min-memory-avail=MIN_MEM_AVAIL, -a MIN_MEM_AVAIL
                           do not serve tokens unless available memory
                           is above MIN_MEM_AVAIL, in MiB
    --min-jobs=MIN_JOBS, -m MIN_JOBS
                           min. jobs to serve even if load average is exceeded
                           (default: 1)
    --per-process-limit=LIMIT, -p LIMIT
                           max. jobs to serve to a single process

    --dev-name=DEV_NAME    override the device name to use (default: steve)
    --user=USER, -u USER   drop superuser privileges and switch to USER
                           (and its primary group)
    --verbose, -v          enable verbose logging
    --debug, -d            enable FUSE debug output
)";

/* getopt_long() return codes for options that have no short form.
 * Values start above the single-byte range so they cannot collide
 * with any short option character. */
struct steve_long_option {
	static constexpr int dev_name{256};
};

/* Long option table for getopt_long().  No entry uses a flag pointer,
 * so getopt_long() returns the value in the last column directly. */
static const struct option steve_long_opts[] = {
	{ "help",                 no_argument,       nullptr, 'h' },
	{ "version",              no_argument,       nullptr, 'V' },
	{ "jobs",                 required_argument, nullptr, 'j' },
	{ "load-average",         required_argument, nullptr, 'l' },
	{ "load-recheck-timeout", required_argument, nullptr, 'r' },
	{ "min-memory-avail",     required_argument, nullptr, 'a' },
	{ "min-jobs",             required_argument, nullptr, 'm' },
	{ "per-process-limit",    required_argument, nullptr, 'p' },
	{ "dev-name",             required_argument, nullptr, steve_long_option::dev_name },
	{ "user",                 required_argument, nullptr, 'u' },
	{ "verbose",              no_argument,       nullptr, 'v' },
	{ "debug",                no_argument,       nullptr, 'd' },
	/* all-zero terminator required by getopt_long() */
	{ nullptr, 0, nullptr, 0 },
};

/* Short option string for getopt_long(); a trailing ':' marks an
 * option that takes a required argument. */
static const char *steve_short_opts{"hVj:l:r:m:a:p:u:vd"};

/* Program entry point.
 *
 * 1. Parses command-line options into a steve_state.
 * 2. Validates the job and memory settings.
 * 3. Sets up the libevent base and the recheck timer.
 * 4. Opens /dev/cuse and optionally drops privileges to --user.
 * 5. Creates the CUSE session and registers the CUSE and signal events.
 * 6. Mounts the session on the already-open fd and runs the event loop.
 *
 * Returns 0 for --help/--version, 1 on any setup failure, and otherwise
 * state.retval as left by the event callbacks.
 */
int main(int argc, char **argv)
{
	steve_state state{};

	int opt;
	bool debug = false;
	const char *user = nullptr;
	while ((opt = getopt_long(argc, argv, steve_short_opts, steve_long_opts, nullptr)) != -1) {
		switch (opt) {
			case 'h':
				std::print(steve_usage, argv[0]);
				return 0;
			case 'V':
				std::print("steve {}\n", STEVE_VERSION);
				return 0;
			case 'j':
			case 'm':
			case 'a':
			case 'p':
				{
					long jobs_arg;
					/* None of these counts/limits is meaningful
					 * when negative; reject instead of storing. */
					if (!arg_to_long(optarg, &jobs_arg) || jobs_arg < 0) {
						std::print(stderr, "invalid argument: {}\n", optarg);
						return 1;
					}
					if (opt == 'j')
						state.jobs = jobs_arg;
					else if (opt == 'm')
						state.min_jobs = jobs_arg;
					else if (opt == 'a')
						state.min_memory_avail = jobs_arg;
					else if (opt == 'p')
						state.per_process_limit = jobs_arg;
					else
						assert(0 && "not reached");
				}
				break;
			case 'l':
				if (!arg_to_double(optarg, &state.max_load_avg) || state.max_load_avg < 1) {
					std::print(stderr, "invalid load average value (must be >=1): {}\n", optarg);
					return 1;
				}
				break;
			case 'r': {
				double timeout;
				if (!arg_to_double(optarg, &timeout) || timeout < 0.000001 || timeout > INT_MAX) {
					std::print(stderr, "invalid timeout value (must be >=1 us): {}\n", optarg);
					return 1;
				}
				steve_timeout_to_timeval(&state.recheck_timeout, timeout);
				break;
			}
			case 'u':
				user = optarg;
				break;
			case 'v':
				state.verbose = true;
				break;
			case 'd':
				debug = true;
				break;
			case steve_long_option::dev_name:
				state.dev_name = optarg;
				break;
			default:
				std::print(stderr, steve_usage, argv[0]);
				return 1;
		}
	}

	if (argv[optind]) {
		std::print(stderr, "{}: unexpected positional arguments\n", argv[0]);
		std::print(stderr, steve_usage, argv[0]);
		return 1;
	}

	/* --jobs not given (or given as 0): fall back to the CPU count. */
	if (state.jobs == 0) {
		long nproc = sysconf(_SC_NPROCESSORS_ONLN);
		if (nproc < 1) {
			std::print(stderr, "unable to determine the number of processors, please specify --jobs\n");
			return 1;
		}
		state.jobs = nproc;
	}
	if (state.min_jobs > state.jobs) {
		std::print(stderr, "--min-jobs ({}) must not be greater than --jobs ({})\n",
				state.min_jobs, state.jobs);
		return 1;
	}
	/* -1 means --min-memory-avail was not requested. */
	if (state.min_memory_avail != -1) {
		auto memtotal = steve_get_meminfo(&state, "\nMemTotal:");
		if (!memtotal.has_value()) {
			std::print(stderr, "--min-memory-avail specified but memory info not available\n");
			return 1;
		}
		state.memory_total = memtotal.value() / 1024;
		if (state.memory_total < state.min_memory_avail) {
			std::print(stderr, "--min-memory-avail requests {} MiB available, but the system has only {} MiB memory\n",
					state.min_memory_avail, state.memory_total);
			return 1;
		}
	}

	std::unique_ptr<struct event_base, std::function<void(struct event_base*)>>
		evb{event_base_new(), event_base_free};
	if (!evb) {
		std::print(stderr, "failed to initialize libevent\n");
		return 1;
	}
	state.evb = evb.get();

	/* `state` is declared before `evb` and is therefore destroyed after
	 * it.  Release the timer event owned by `state` while the event base
	 * is still alive, on every exit path from here on. */
	struct recheck_event_releaser {
		steve_state *st;
		~recheck_event_releaser() { st->recheck_event.reset(); }
	} recheck_event_guard{&state};

	state.recheck_event.reset(
		evtimer_new(state.evb, steve_handle_recheck, &state)
	);
	if (!state.recheck_event) {
		std::print(stderr, "failed to initialize timer recheck event\n");
		return 1;
	}

	/* Open the device before dropping privileges. */
	int cuse_fd = open("/dev/cuse", O_RDWR);
	if (cuse_fd == -1) {
		perror("unable to open /dev/cuse");
		return 1;
	}
	fd_guard cuse_fd_guard{cuse_fd};

	if (user && !steve_drop_privileges(user))
		return 1;

	std::string dev_name_arg{"DEVNAME="};
	dev_name_arg += state.dev_name;
	const char *dev_info_argv[] = { dev_name_arg.c_str() };
	struct cuse_info ci{};
	ci.dev_info_argc = 1;
	ci.dev_info_argv = dev_info_argv;

	struct fuse_args args = FUSE_ARGS_INIT(0, nullptr);
	std::unique_ptr<struct fuse_args, std::function<void(struct fuse_args*)>>
		args_ptr{&args, fuse_opt_free_args};
	fuse_opt_add_arg(args_ptr.get(), argv[0]);
	if (debug)
		fuse_opt_add_arg(args_ptr.get(), "-d");

	std::unique_ptr<struct fuse_session, std::function<void(struct fuse_session*)>> session{
		cuse_lowlevel_new(args_ptr.get(), &ci, &steve_ops, &state), fuse_session_destroy};
	if (!session) {
		std::print(stderr, "failed to initialize FUSE\n");
		return 1;
	}
	state.session = session.get();

	event_ptr cuse_event{event_new(evb.get(), cuse_fd, EV_READ|EV_PERSIST, steve_handle_cuse, &state), event_free};
	if (!cuse_event) {
		std::print(stderr, "failed to initialize CUSE handler\n");
		return 1;
	}
	if (event_add(cuse_event.get(), nullptr) == -1) {
		std::print(stderr, "failed to enable CUSE handler\n");
		return 1;
	}

	/* steve_new_signal_handler() reports its own errors; treat a failed
	 * handler like any other initialization failure. */
	event_ptr sigusr1_event = steve_new_signal_handler(&state, SIGUSR1, steve_handle_sigusr1);
	if (!sigusr1_event)
		return 1;
	std::vector<event_ptr> term_signal_handlers;
	for (int signum : {SIGHUP, SIGINT, SIGTERM}) {
		term_signal_handlers.push_back(
				steve_new_signal_handler(&state, signum, steve_handle_exit));
		if (!term_signal_handlers.back())
			return 1;
	}

	/* Mount on the fd opened above instead of letting libfuse open the
	 * device itself. */
	std::string mountpoint = std::format("/dev/fd/{}", cuse_fd);
	if (fuse_session_mount(session.get(), mountpoint.c_str()) == -1) {
		std::print(stderr, "failed to mount the filesystem\n");
		return 1;
	}

	if (event_base_dispatch(evb.get()) == -1) {
		std::print(stderr, "event loop failed\n");
		state.retval = 1;
	}
	fuse_session_unmount(session.get());
	return state.retval;
}
