You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
488 lines
11 KiB
488 lines
11 KiB
3 years ago
|
#include <linux/export.h>
|
||
|
#include <linux/compiler.h>
|
||
|
#include <linux/dax.h>
|
||
|
#include <linux/fs.h>
|
||
|
#include <linux/sched/signal.h>
|
||
|
#include <linux/uaccess.h>
|
||
|
#include <linux/capability.h>
|
||
|
#include <linux/kernel_stat.h>
|
||
|
#include <linux/gfp.h>
|
||
|
#include <linux/mm.h>
|
||
|
#include <linux/swap.h>
|
||
|
#include <linux/mman.h>
|
||
|
#include <linux/pagemap.h>
|
||
|
#include <linux/file.h>
|
||
|
#include <linux/uio.h>
|
||
|
#include <linux/hash.h>
|
||
|
#include <linux/writeback.h>
|
||
|
#include <linux/backing-dev.h>
|
||
|
#include <linux/pagevec.h>
|
||
|
#include <linux/blkdev.h>
|
||
|
#include <linux/security.h>
|
||
|
#include <linux/cpuset.h>
|
||
|
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
|
||
|
#include <linux/hugetlb.h>
|
||
|
#include <linux/memcontrol.h>
|
||
|
#include <linux/cleancache.h>
|
||
|
#include <linux/rmap.h>
|
||
|
#include <linux/module.h>
|
||
|
#include <linux/io_record.h>
|
||
|
#include "internal.h"
|
||
|
|
||
|
/* One recorded I/O request, stored as an element of record_buf. */
struct io_info {
	struct file *file;	/* file the request was issued on */
	struct inode *inode;	/* inode of @file, used as the grouping key */
	int offset;		/* start offset of the request */
	int nr_pages;		/* size of the request */
};
|
||
|
|
||
|
/* capacity of record_buf, counted in struct io_info entries */
#define NUM_IO_INFO_IN_BUF (128 * 1024)
/* capacity of result_buf in bytes (5MB) */
#define RESULT_BUF_SIZE_IN_BYTES (5 * 1024 * 1024)
/* terminator value stored in result_buf: all bits set, i.e. -1 */
#define RESULT_BUF_END_MAGIC (~0)

/* array of struct io_info */
struct io_info *record_buf;
/* buffer used for post processing result */
void *result_buf;
|
||
|
|
||
|
/*
 * format in result buf per file:
 *   <A = length of "path" (size = sizeof(int))>
 *   <"path" string, without terminating NUL (size = A)>
 *   <tuple array of B (offset, size) pairs (size = B * sizeof(int) * 2)>
 *   <end MAGIC pair (val = -1, size = sizeof(int) * 2)>
 */
|
||
|
#define MAX_FILEPATH_LEN 256

/*
 * NOTE: this describes fill_result_buf(): it returns the bytes written to
 * the result buf; if the buffer is full, it returns < 0.
 */

/* current write position; this is touched by post processing only */
void *result_buf_cursor;

/*
 * Append @size bytes from @src at result_buf_cursor and advance the cursor
 * past them. No bounds check is done here.
 */
void write_to_result_buf(void *src, int size)
{
	char *dst = result_buf_cursor;

	memcpy(dst, src, size);
	result_buf_cursor = dst + size;
}
|
||
|
|
||
|
/* this assumes that start_idx~end_idx belong to the same inode */
|
||
|
int fill_result_buf(int start_idx, int end_idx)
|
||
|
{
|
||
|
int ret = 0;
|
||
|
int i;
|
||
|
int size_expected;
|
||
|
char strbuf[MAX_FILEPATH_LEN];
|
||
|
char *path;
|
||
|
int pathsize;
|
||
|
int result_buf_used;
|
||
|
int prev_offset = -1;
|
||
|
int long max_size = 0;
|
||
|
void *buf_start;
|
||
|
struct file *file;
|
||
|
|
||
|
if (start_idx >= end_idx)
|
||
|
BUG_ON(1); /* this case is not in consideration */
|
||
|
|
||
|
file = record_buf[start_idx].file;
|
||
|
path = d_path(&file->f_path, strbuf, MAX_FILEPATH_LEN);
|
||
|
if (!path || IS_ERR(path))
|
||
|
goto out;
|
||
|
|
||
|
/* max size check (not strict) */
|
||
|
result_buf_used = result_buf_cursor - result_buf;
|
||
|
size_expected = sizeof(int) * 2 + /* end magic of this attempt */
|
||
|
sizeof(int) + strlen(path) + /* for path string */
|
||
|
sizeof(int) * 2 * (end_idx - start_idx) + /* data */
|
||
|
sizeof(int); /* end magic of post-processing */
|
||
|
if (size_expected > (RESULT_BUF_SIZE_IN_BYTES - result_buf_used))
|
||
|
return -EINVAL;
|
||
|
|
||
|
buf_start = result_buf_cursor;
|
||
|
pathsize = strlen(path);
|
||
|
write_to_result_buf(&pathsize, sizeof(int));
|
||
|
write_to_result_buf(path, pathsize);
|
||
|
|
||
|
/* fill the result buf using the record buf */
|
||
|
for (i = start_idx; i < end_idx; i++) {
|
||
|
if (prev_offset == -1) {
|
||
|
prev_offset = record_buf[i].offset;
|
||
|
max_size = record_buf[i].nr_pages;
|
||
|
continue;
|
||
|
}
|
||
|
/* in the last range */
|
||
|
if ((prev_offset + max_size) >=
|
||
|
(record_buf[i].offset + record_buf[i].nr_pages)) {
|
||
|
continue;
|
||
|
} else {
|
||
|
if ((prev_offset + max_size) >= record_buf[i].offset) {
|
||
|
max_size = (record_buf[i].offset +
|
||
|
record_buf[i].nr_pages) -
|
||
|
prev_offset;
|
||
|
} else {
|
||
|
write_to_result_buf(&prev_offset,
|
||
|
sizeof(int));
|
||
|
write_to_result_buf(&max_size,
|
||
|
sizeof(int));
|
||
|
prev_offset = record_buf[i].offset;
|
||
|
max_size = record_buf[i].nr_pages;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
/* fill the record buf */
|
||
|
write_to_result_buf(&prev_offset, sizeof(int));
|
||
|
write_to_result_buf(&max_size, sizeof(int));
|
||
|
|
||
|
/* fill the record buf with final magic */
|
||
|
prev_offset = RESULT_BUF_END_MAGIC;
|
||
|
max_size = RESULT_BUF_END_MAGIC;
|
||
|
write_to_result_buf(&prev_offset, sizeof(int));
|
||
|
write_to_result_buf(&max_size, sizeof(int));
|
||
|
|
||
|
/* return # of bytes written to result buf */
|
||
|
ret = result_buf_cursor - buf_start;
|
||
|
out:
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/* idx record_buf_cursor of the array (record_buf) */
|
||
|
static atomic_t record_buf_cursor = ATOMIC_INIT(-1);
|
||
|
|
||
|
static DEFINE_RWLOCK(record_rwlock);
|
||
|
int record_target; /* pid # of group leader */
|
||
|
bool record_enable;
|
||
|
|
||
|
static DEFINE_MUTEX(status_lock);
|
||
|
enum io_record_cmd_types current_status = IO_RECORD_INIT;
|
||
|
|
||
|
static inline void set_record_status(bool enable)
|
||
|
{
|
||
|
write_lock(&record_rwlock);
|
||
|
record_enable = enable;
|
||
|
write_unlock(&record_rwlock);
|
||
|
}
|
||
|
|
||
|
/* assume caller has read lock of record_rwlock */
|
||
|
static inline bool __get_record_status(void)
|
||
|
{
|
||
|
return record_enable;
|
||
|
}
|
||
|
|
||
|
static inline void set_record_target(int pid)
|
||
|
{
|
||
|
write_lock(&record_rwlock);
|
||
|
record_target = pid;
|
||
|
write_unlock(&record_rwlock);
|
||
|
}
|
||
|
|
||
|
void release_records(void);
|
||
|
|
||
|
/* change the current status, and do the init jobs for the status */
|
||
|
static void change_current_status(enum io_record_cmd_types status)
|
||
|
{
|
||
|
switch (status) {
|
||
|
case IO_RECORD_INIT:
|
||
|
set_record_status(false);
|
||
|
set_record_target(-1);
|
||
|
release_records();
|
||
|
atomic_set(&record_buf_cursor, 0);
|
||
|
result_buf_cursor = result_buf;
|
||
|
break;
|
||
|
case IO_RECORD_START:
|
||
|
set_record_status(true);
|
||
|
break;
|
||
|
case IO_RECORD_STOP:
|
||
|
set_record_status(false);
|
||
|
break;
|
||
|
case IO_RECORD_POST_PROCESSING:
|
||
|
break;
|
||
|
case IO_RECORD_POST_PROCESSING_DONE:
|
||
|
break;
|
||
|
}
|
||
|
current_status = status;
|
||
|
}
|
||
|
|
||
|
/* Only this function contains the status change rules */
|
||
|
/* Assume that the caller has the status lock */
|
||
|
static inline bool change_status_if_valid(enum io_record_cmd_types next_status)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
|
||
|
if (!record_buf)
|
||
|
return false;
|
||
|
|
||
|
if (next_status == IO_RECORD_INIT &&
|
||
|
current_status != IO_RECORD_POST_PROCESSING)
|
||
|
ret = true;
|
||
|
else if (next_status == (current_status + 1))
|
||
|
ret = true;
|
||
|
if (ret)
|
||
|
change_current_status(next_status);
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/*
 * control :
 *  - record start
 *  - record end
 *  - data read
 * hook :
 *  - syscall, or pagecache add or ...
 *  - filemap fault
 * buffer : (to store records between record start and record end)
 * post-processing : merge, etc.
 * output : return post processing result to userspace...
 */
|
||
|
|
||
|
/* return 0 on success */
|
||
|
bool start_record(int pid)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
|
||
|
mutex_lock(&status_lock);
|
||
|
if (!change_status_if_valid(IO_RECORD_START))
|
||
|
goto out;
|
||
|
|
||
|
set_record_target(pid);
|
||
|
|
||
|
ret = true;
|
||
|
out:
|
||
|
mutex_unlock(&status_lock);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
bool stop_record(void)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
|
||
|
mutex_lock(&status_lock);
|
||
|
if (!change_status_if_valid(IO_RECORD_STOP))
|
||
|
goto out;
|
||
|
|
||
|
ret = true;
|
||
|
out:
|
||
|
mutex_unlock(&status_lock);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
#include <linux/sort.h>
|
||
|
|
||
|
static void io_info_swap(void *lhs, void *rhs, int size)
|
||
|
{
|
||
|
struct io_info tmp;
|
||
|
struct io_info *linfo = (struct io_info *)lhs;
|
||
|
struct io_info *rinfo = (struct io_info *)rhs;
|
||
|
|
||
|
memcpy(&tmp, linfo, sizeof(struct io_info));
|
||
|
memcpy(linfo, rinfo, sizeof(struct io_info));
|
||
|
memcpy(rinfo, &tmp, sizeof(struct io_info));
|
||
|
}
|
||
|
|
||
|
static int io_info_compare(const void *lhs, const void *rhs)
|
||
|
{
|
||
|
struct io_info *linfo = (struct io_info *)lhs;
|
||
|
struct io_info *rinfo = (struct io_info *)rhs;
|
||
|
|
||
|
if ((unsigned long)linfo->inode > (unsigned long)rinfo->inode)
|
||
|
return 1;
|
||
|
else if ((unsigned long)linfo->inode < (unsigned long)rinfo->inode)
|
||
|
return -1;
|
||
|
else {
|
||
|
if ((unsigned long)linfo->offset > (unsigned long)rinfo->offset)
|
||
|
return 1;
|
||
|
else if ((unsigned long)linfo->offset <
|
||
|
(unsigned long)rinfo->offset)
|
||
|
return -1;
|
||
|
else
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool post_processing_records(void)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
int i;
|
||
|
struct inode *prev_inode = NULL;
|
||
|
int start_idx = -1, end_idx = -1;
|
||
|
int last_magic = RESULT_BUF_END_MAGIC;
|
||
|
|
||
|
mutex_lock(&status_lock);
|
||
|
if (!change_status_if_valid(IO_RECORD_POST_PROCESSING))
|
||
|
goto out;
|
||
|
|
||
|
/* From this point, we assume that no one touches record buf */
|
||
|
/* sort based on inode pointer address */
|
||
|
sort(record_buf, atomic_read(&record_buf_cursor),
|
||
|
sizeof(struct io_info), &io_info_compare, &io_info_swap);
|
||
|
|
||
|
/* fill the result buf per inode */
|
||
|
for (i = 0; i < atomic_read(&record_buf_cursor); i++) {
|
||
|
if (prev_inode != record_buf[i].inode) {
|
||
|
end_idx = i;
|
||
|
if (prev_inode && (fill_result_buf(start_idx,
|
||
|
end_idx) < 0))
|
||
|
/* if result buf full, break without write */
|
||
|
break;
|
||
|
prev_inode = record_buf[i].inode;
|
||
|
start_idx = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (start_idx != -1)
|
||
|
fill_result_buf(start_idx, i);
|
||
|
|
||
|
/* fill the last magic to indicate end of result */
|
||
|
write_to_result_buf(&last_magic, sizeof(int));
|
||
|
|
||
|
if (!change_status_if_valid(IO_RECORD_POST_PROCESSING_DONE))
|
||
|
BUG_ON(1); /* this is the case not in consideration */
|
||
|
|
||
|
ret = true;
|
||
|
out:
|
||
|
mutex_unlock(&status_lock);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/*
 * Copy the post processing result to userspace, starting at *ppos.
 * Returns the number of bytes copied (and advances *ppos by it), 0 when
 * *ppos is at or past the end of the result, or -EFAULT when the status is
 * not IO_RECORD_POST_PROCESSING_DONE or the copy to @buf fails.
 */
ssize_t read_record(char __user *buf, size_t count, loff_t *ppos)
{
	int available;
	int ret;

	mutex_lock(&status_lock);
	if (current_status != IO_RECORD_POST_PROCESSING_DONE) {
		ret = -EFAULT;
	} else {
		/* bytes produced by post processing */
		available = result_buf_cursor - result_buf;
		if (*ppos >= available) {
			ret = 0;
		} else {
			/* clamp the request to what is left */
			ret = (*ppos + count < available) ? count :
				(available - *ppos);
			if (copy_to_user(buf, result_buf + *ppos, ret))
				ret = -EFAULT;
			else
				*ppos = *ppos + ret;
		}
	}
	mutex_unlock(&status_lock);

	return ret;
}
|
||
|
|
||
|
/*
|
||
|
* if this is not called explicitly by user processes, kernel should call this
|
||
|
* at some point.
|
||
|
*/
|
||
|
bool init_record(void)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
|
||
|
mutex_lock(&status_lock);
|
||
|
if (!change_status_if_valid(IO_RECORD_INIT))
|
||
|
goto out;
|
||
|
ret = true;
|
||
|
out:
|
||
|
mutex_unlock(&status_lock);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
bool forced_init_record(void)
|
||
|
{
|
||
|
bool ret = false;
|
||
|
int loopnum = 0;
|
||
|
|
||
|
if (!record_buf)
|
||
|
goto out;
|
||
|
retry:
|
||
|
loopnum++;
|
||
|
ret = init_record();
|
||
|
if (!ret)
|
||
|
goto retry;
|
||
|
|
||
|
if (loopnum > 1)
|
||
|
pr_err("%s,%d: loopnum %d\n", __func__, __LINE__, loopnum);
|
||
|
ret = true;
|
||
|
out:
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/*
 * Hook: store one request (@file, @offset, @req_size) in record_buf when
 * the current task group is the record target and recording is enabled.
 * The request is dropped silently in every other case, including when
 * record_rwlock cannot be taken right away or the buffer is full.
 */
void record_io_info(struct file *file, pgoff_t offset,
		    unsigned long req_size)
{
	struct io_info *slot;
	int idx;

	/* check without lock */
	if ((int)task_tgid_nr(current) != record_target)
		return;

	/* both values are stored as int */
	if (offset >= INT_MAX || req_size >= INT_MAX)
		return;

	idx = atomic_read(&record_buf_cursor);
	if (idx < 0 || idx >= NUM_IO_INFO_IN_BUF || !file || req_size == 0)
		return;

	if (!read_trylock(&record_rwlock))
		return;

	/* strict check, now that the lock is held */
	if (__get_record_status() &&
	    (int)task_tgid_nr(current) == record_target) {
		/* claim a slot */
		idx = atomic_inc_return(&record_buf_cursor) - 1;
		if (idx >= NUM_IO_INFO_IN_BUF) {
			/* buffer is full: give the slot back */
			atomic_dec(&record_buf_cursor);
		} else {
			slot = &record_buf[idx];

			get_file(file); /* will be put in release_records */
			slot->file = file;
			slot->inode = file_inode(file);
			slot->offset = (int)offset;
			slot->nr_pages = (int)req_size;
		}
	}

	read_unlock(&record_rwlock);
}
|
||
|
|
||
|
void release_records(void)
|
||
|
{
|
||
|
int i;
|
||
|
struct io_info *info;
|
||
|
|
||
|
for (i = 0; i < atomic_read(&record_buf_cursor); i++) {
|
||
|
info = record_buf + i;
|
||
|
fput(info->file);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
 * Allocate record_buf and result_buf and enter IO_RECORD_INIT.
 *
 * Return: 0 on success, -ENOMEM if one of the buffers cannot be allocated.
 * On failure record_buf is left NULL, so change_status_if_valid() keeps
 * rejecting every status change.
 */
static int __init io_record_init(void)
{
	record_buf = vzalloc(sizeof(struct io_info) * NUM_IO_INFO_IN_BUF);
	if (!record_buf)
		return -ENOMEM;

	result_buf = vzalloc(RESULT_BUF_SIZE_IN_BYTES);
	if (!result_buf) {
		vfree(record_buf);
		/* do not leave a dangling pointer that looks initialized */
		record_buf = NULL;
		return -ENOMEM;
	}

	mutex_lock(&status_lock);
	if (!change_status_if_valid(IO_RECORD_INIT))
		BUG_ON(1); /* should success at boot time */
	mutex_unlock(&status_lock);

	return 0;
}

module_init(io_record_init);
|