From 4621bd773bd109e71736b1a472559f3f19b89a04 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 1 Jan 2019 21:33:11 +0800 Subject: [PATCH 001/134] f2fs: change error code to -ENOMEM from -EINVAL The error case of failing allocating memory should return -ENOMEM. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 76a791f841ff..3c4dcf381c3b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) From 52b24f802a8159faa1c284562f0e26d3e1d264c8 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Thu, 3 Jan 2019 20:06:38 +0800 Subject: [PATCH 002/134] f2fs: fix compile warnings: 'struct *' declared inside parameter list We meet these compile warnings below, which caused by missing declare structs: struct f2fs_io_info, struct extent, struct f2fs_sb_info. warning: 'struct f2fs_io_info' declared inside parameter list warning: 'struct extent_info' declared inside parameter list warning: 'struct f2fs_sb_info' declared inside parameter list Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 06c87f9f720c..c842bc70c68c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -150,6 +150,9 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +struct f2fs_sb_info; +struct f2fs_io_info; +struct extent_info; struct victim_sel_policy; struct f2fs_map_blocks; From 63385cccb48b597867afe022a1ed29179d65ab58 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 4 Jan 2019 01:38:29 +0000 Subject: [PATCH 003/134] f2fs: remove set but not used variable 'err' Fixes gcc '-Wunused-but-set-variable' warning: fs/f2fs/data.c: In function 'f2fs_dio_submit_bio': fs/f2fs/data.c:2585:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f09187c3d962..c0ee1c06e70e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2581,14 +2581,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (bio_op(bio) == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; From 43d60e5b8680e8969c7f168c8604870177542280 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Jan 2019 17:39:53 +0800 Subject: [PATCH 004/134] f2fs: check inject_rate validity during configuring Type of inject_rate is unsigned int, let's check new value's validity during configuring. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 948c1a211341..b427122cb71a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); From 9df86e4e8b9bd2392d97a035b9ed8c79e700cf6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2019 17:19:08 -0800 Subject: [PATCH 005/134] f2fs: export FS_NOCOW_FL flag to user This exports pin_file status to user. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecaa01aaf20e..8cd739b51436 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2302,11 +2302,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dfd26475c582..2a06e0f19a9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1651,6 +1651,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; From 537bb5c403d38f8947c6b1e08340994ba446d56a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Jan 2019 14:26:18 +0100 Subject: [PATCH 006/134] f2fs: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Greg Kroah-Hartman Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 20 +++----------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/super.c | 5 +---- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f05b37ef7182..d00ba9b711a4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -522,30 +522,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kvfree(si); } -int __init f2fs_create_root_stats(void) +void __init f2fs_create_root_stats(void) { - struct dentry *file; - f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - if (!f2fs_debugfs_root) - return -ENOMEM; - file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, - NULL, &stat_fops); - if (!file) { - debugfs_remove(f2fs_debugfs_root); - f2fs_debugfs_root = NULL; - return -ENOMEM; - } - - return 0; + debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + &stat_fops); } void f2fs_destroy_root_stats(void) { - if (!f2fs_debugfs_root) - return; - debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8cd739b51436..fc6438a40a93 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3330,7 +3330,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); -int __init f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) do { } while (0) @@ -3368,7 +3368,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline int __init f2fs_create_root_stats(void) { return 0; } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3c4dcf381c3b..141cac14fc67 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3549,9 +3549,7 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - err = f2fs_create_root_stats(); - if (err) - goto free_filesystem; + f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; @@ -3559,7 +3557,6 @@ static int __init init_f2fs_fs(void) free_root_stats: f2fs_destroy_root_stats(); -free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); From 8faba5de867b358d5a9136e732df7803945e418d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Jan 2019 10:21:24 +0800 Subject: [PATCH 007/134] f2fs: fix to trigger fsck if dirent.name_len is zero While traversing dirents in f2fs_fill_dentries(), if bitmap is valid, filename length should not be zero, otherwise, directory structure consistency could be corrupted, in this case, let's print related info and set SBI_NEED_FSCK to trigger fsck for repairing. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7ff9e993008e..8c1457a7e187 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } From 4951bf45e057b891d79a6438e5302b2c9f685673 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 7 Jan 2019 15:02:34 +0800 Subject: [PATCH 008/134] f2fs: check if file namelen exceeds max value Dentry bitmap is not enough to detect incorrect dentries. So this patch also checks the namelen value of a dentry. Signed-off-by: Gong Chen Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8c1457a7e187..0e0e0880c4de 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); From ddb9a293f5e4217e373adfc11e98dfe004a6fced Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 14 Jan 2019 22:05:14 +0800 Subject: [PATCH 009/134] f2fs: add brackets for macros Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc6438a40a93..c62b905b0626 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -255,7 +255,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -2764,9 +2764,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2795,8 +2795,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); From a6d7095630e77f6e7485589a20ade0b59b39aed2 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 15 Jan 2019 20:02:15 +0000 Subject: [PATCH 010/134] f2fs: UBSAN: set boolean value iostat_enable correctly When setting /sys/fs/f2fs//iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b427122cb71a..a609c0abbdd0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -280,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } From 1c3c592eaa5344727787b76322fc9500d696fdaf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Jan 2019 09:51:28 +0800 Subject: [PATCH 011/134] f2fs: fix to initialize variable to avoid UBSAN/smatch warning As Dan Carpenter as below: The patch df634f444ee9: "f2fs: use rb_*_cached friends" from Oct 4, 2018, leads to the following static checker warning: fs/f2fs/extent_cache.c:606 f2fs_update_extent_tree_range() error: uninitialized symbol 'leftmost'. And also Eric Biggers, and Kyungtae Kim reported, there is an UBSAN warning described as below: We report a bug in linux-4.20.2: "UBSAN: Undefined behaviour in fs/f2fs/extent_cache.c" kernel config: https://kt0755.github.io/etc/config_v4.20_stable repro: https://kt0755.github.io/etc/repro.4a3e7.c (f2fs is mounted on /mnt/f2fs/) This arose in f2fs_update_extent_tree_range (fs/f2fs/extent_cache.c:605). It seems that, for some reason, its last argument became "24" although that was supposed to be bool type. ========================================= UBSAN: Undefined behaviour in fs/f2fs/extent_cache.c:605:4 load of value 24 is not a valid value for type '_Bool' CPU: 0 PID: 6774 Comm: syz-executor5 Not tainted 4.20.2 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xb1/0x118 lib/dump_stack.c:113 ubsan_epilogue+0x12/0x94 lib/ubsan.c:159 __ubsan_handle_load_invalid_value+0x17a/0x1be lib/ubsan.c:457 f2fs_update_extent_tree_range+0x1d4a/0x1d50 fs/f2fs/extent_cache.c:605 f2fs_update_extent_cache+0x2b6/0x350 fs/f2fs/extent_cache.c:804 f2fs_update_data_blkaddr+0x61/0x70 fs/f2fs/data.c:656 f2fs_outplace_write_data+0x1d6/0x4b0 fs/f2fs/segment.c:3140 f2fs_convert_inline_page+0x86d/0x2060 fs/f2fs/inline.c:163 f2fs_convert_inline_inode+0x6b5/0xad0 fs/f2fs/inline.c:208 f2fs_preallocate_blocks+0x78b/0xb00 fs/f2fs/data.c:982 f2fs_file_write_iter+0x31b/0xf40 fs/f2fs/file.c:3062 call_write_iter include/linux/fs.h:1857 [inline] new_sync_write fs/read_write.c:474 [inline] __vfs_write+0x538/0x6e0 fs/read_write.c:487 vfs_write+0x1b3/0x520 fs/read_write.c:549 ksys_write+0xde/0x1c0 fs/read_write.c:598 __do_sys_write fs/read_write.c:610 [inline] __se_sys_write fs/read_write.c:607 [inline] __x64_sys_write+0x7e/0xc0 fs/read_write.c:607 do_syscall_64+0xbe/0x4f0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 Code: e8 8c 9f 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 9b 6b fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f1ea15edc68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007f1ea15ee6cc RCX: 00000000004497b9 RDX: 0000000000001000 RSI: 0000000020000140 RDI: 0000000000000013 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 000000000000bb50 R14: 00000000006f4bf0 R15: 00007f1ea15ee700 ========================================= As I checked, this uninitialized variable won't cause extent cache corruption, but in order to avoid such kind of warning of both UBSAN and smatch, fix to initialize related variable. Reported-by: Dan Carpenter Reported-by: Eric Biggers Reported-by: Kyungtae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cb0fcc67d2d..caf77fe8ac07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; - bool leftmost; + bool leftmost = false; if (!et) return; From d09408e2f47cad1970e3e6d61808209719dc67a8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 10 Jan 2019 16:40:12 +0800 Subject: [PATCH 012/134] f2fs: fix to set sbi dirty correctly In order to record direct IO count, we add two additional type in enum count_type: F2FS_DIO_{WRITE,READ}, but those IO won't dirty filesystem metadata, so we don't need to set filesystem dirty in inc_page_count(), fix it. Fixes: 02b16d0a34a1 ("f2fs: add to account direct IO") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c62b905b0626..b822ac3cb265 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1800,13 +1800,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) From a0172a986cb1ebcf496cae7cbf9a25ed3c5eb439 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Jan 2019 10:42:11 -0800 Subject: [PATCH 013/134] f2fs: run discard jobs when put_super When we umount f2fs, we need to avoid long delay due to discard commands, which is actually taking tens of seconds, if storage is very slow on UNMAP. So, this patch introduces timeout-based work on it. By default, let me give 5 seconds for discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/f2fs.h | 5 ++++- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/super.c | 4 +++- fs/f2fs/sysfs.c | 3 +++ 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 381ab9fed3e6..a6fe7368b26c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -86,6 +86,13 @@ Description: The unit size is one block, now only support configuring in range of [1, 512]. +What: /sys/fs/f2fs//umount_discard_timeout +Date: January 2019 +Contact: "Jaegeuk Kim" +Description: + Set timeout to issue discard commands during umount. + Default: 5 secs + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b822ac3cb265..417585b0a494 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -192,6 +192,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -311,6 +312,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -1111,6 +1113,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -3007,7 +3010,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b79056d705d..5b2b9be6f28d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1037,6 +1037,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1424,7 +1425,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if (dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1611,7 +1619,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1619,6 +1627,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 141cac14fc67..95a3445f97ca 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1048,7 +1048,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -2706,6 +2706,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a609c0abbdd0..d7b47662a0aa 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -426,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -483,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), From 26fe3e8cc3a23131a5165ac571166d6336b42f78 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jan 2019 17:48:38 -0800 Subject: [PATCH 014/134] f2fs: add quick mode of checkpoint=disable for QA This mode returns mount() quickly with EAGAIN. We can trigger this by shutdown(F2FS_GOING_DOWN_NEED_FSCK). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 6 +++--- fs/f2fs/segment.c | 3 +++ fs/f2fs/super.c | 5 +++++ include/linux/f2fs_fs.h | 1 + 6 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d27d3c4458e4..228d0e685b8d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1259,6 +1259,11 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 417585b0a494..f2bfe067aef7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -192,6 +192,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { @@ -1102,6 +1103,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2a06e0f19a9a..149710854a3a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1972,11 +1972,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5b2b9be6f28d..342b720fb4db 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -868,6 +868,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 95a3445f97ca..f28baa17e98c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3205,6 +3205,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3391,6 +3395,7 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; free_meta: diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d7711048ef93..d6befe1f9dc7 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_DISABLED_QUICK_FLAG 0x00002000 #define CP_DISABLED_FLAG 0x00001000 #define CP_QUOTA_NEED_FSCK_FLAG 0x00000800 #define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 From 78a402a25b050119fe0363bff075c3adb0e8a037 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 09:12:13 -0800 Subject: [PATCH 015/134] f2fs: try to keep CP_TRIMMED_FLAG after successful umount If every discard were issued successfully, we can avoid further discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 342b720fb4db..fdd8cd21522f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1063,6 +1063,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } From 4b7eff5879635a0a52e4bf9f8f916d76fe9b20f0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 10:26:39 -0800 Subject: [PATCH 016/134] f2fs: don't wake up too frequently, if there is lots of IOs Otherwise, it wakes up discard thread which will sleep again by busy IOs in a loop. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; From d37a8cdf6b7bd626830f8fdc0562e4c12eaef11e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 12:05:25 -0800 Subject: [PATCH 017/134] f2fs: avoid null pointer exception in dcc_info If dcc_info is not set yet, we can get null pointer panic. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f2bfe067aef7..1fa0013b8669 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2162,10 +2162,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return false; + return f2fs_time_over(sbi, type); } From b78c5986c2d91da1313362861e84adcb8eb31bc8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 27 Jan 2019 17:59:53 -0800 Subject: [PATCH 018/134] f2fs: flush quota blocks after turnning it off After quota_off, we'll get some dirty blocks. If put_super don't have a chance to flush them by checkpoint, it causes NULL pointer exception in end_io after iput(node_inode). (e.g., by checkpoint=disable) Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f28baa17e98c..f18fab590116 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2026,6 +2026,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) From 90ba3ec9641d50a85ea8567af371ddd84e5c610a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Jan 2019 14:04:33 -0800 Subject: [PATCH 019/134] f2fs: sync filesystem after roll-forward recovery Some works after roll-forward recovery can get an error which will release all the data structures. Let's flush them in order to make it clean. One possible corruption came from: [ 90.400500] list_del corruption. prev->next should be ffffffed1f566208, but was (null) [ 90.675349] Call trace: [ 90.677869] __list_del_entry_valid+0x94/0xb4 [ 90.682351] remove_dirty_inode+0xac/0x114 [ 90.686563] __f2fs_write_data_pages+0x6a8/0x6c8 [ 90.691302] f2fs_write_data_pages+0x40/0x4c [ 90.695695] do_writepages+0x80/0xf0 [ 90.699372] __writeback_single_inode+0xdc/0x4ac [ 90.704113] writeback_sb_inodes+0x280/0x440 [ 90.708501] wb_writeback+0x1b8/0x3d0 [ 90.712267] wb_workfn+0x1a8/0x4d4 [ 90.715765] process_one_work+0x1c0/0x3d4 [ 90.719883] worker_thread+0x224/0x344 [ 90.723739] kthread+0x120/0x130 [ 90.727055] ret_from_fork+0x10/0x18 Reported-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- fs/f2fs/node.c | 4 +++- fs/f2fs/super.c | 44 +++++++++++++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 228d0e685b8d..47d720562b3a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 46fdaebd6185..e39ca804df6f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1922,7 +1922,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f18fab590116..445b0c8a56c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1458,9 +1458,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1468,18 +1475,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1488,7 +1501,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -3368,7 +3383,7 @@ skip_recovery: if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3381,7 +3396,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3404,6 +3419,11 @@ skip_recovery: clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry = false; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3417,6 +3437,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); From c34ed1e18ecb74b62e2b57c96ec4a99a4cb71ae8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 25 Jan 2019 20:11:39 +0800 Subject: [PATCH 020/134] f2fs: use xattr_prefix to wrap up Let's use xattr_prefix instead of open code. No logic changes. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 18d5ffbc5e8c..fa620d31ea5f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -538,7 +538,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!handler || (handler->list && !handler->list(dentry))) continue; - prefix = handler->prefix ?: handler->name; + prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { From f606c8b390684154bbb19df052b9236a1ba65a76 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 25 Jan 2019 15:35:01 +0800 Subject: [PATCH 021/134] f2fs: fix typos in code comments lengh -> length Signed-off-by: Geliang Tang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d6befe1f9dc7..8d57aaee8166 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -187,7 +187,7 @@ struct f2fs_orphan_block { struct f2fs_extent { __le32 fofs; /* start file offset of the extent */ __le32 blk; /* start block address of the extent */ - __le32 len; /* lengh of the extent */ + __le32 len; /* length of the extent */ } __packed; #define F2FS_NAME_LEN 255 @@ -512,7 +512,7 @@ typedef __le32 f2fs_hash_t; struct f2fs_dir_entry { __le32 hash_code; /* hash code of file name */ __le32 ino; /* inode number */ - __le16 name_len; /* lengh of file name */ + __le16 name_len; /* length of file name */ __u8 file_type; /* file type */ } __packed; From 51f50099a25892f1ed758f137c683c12081f3f4c Mon Sep 17 00:00:00 2001 From: zhengliang Date: Thu, 24 Jan 2019 20:57:03 +0800 Subject: [PATCH 022/134] f2fs: fix to data block override node segment by mistake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following race could lead to data block override node segment by mistake. Task A | Task B | Task C | Task D ======= | ======== |========== | ========= open file | | | white file | | | submit bio | | | wait io complete | | | | remove file | | ........ | iput_final | | | | sync | | | do checkpoint | | | data segment free | | | | create file1 | | | allocate node segment(if it is the same segment freed by Task C) f2fs_write_end_io | | | So we need to guarantee io complete before truncate inode in f2fs_drop_inode. Signed-off-by: Zheng Liang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 445b0c8a56c0..b5127a1440d9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -915,6 +915,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); From b62b71523d5e8242078dc10a48c056c2675674af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 24 Jan 2019 17:18:07 +0800 Subject: [PATCH 023/134] f2fs: fix to document inline_xattr_size option We missed to add document for inline_xattr_size mount option in f2fs.txt, add it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index fedbea893ee7..35c777b6e69b 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -126,6 +126,8 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. noinline_xattr Disable the inline xattrs feature. +inline_xattr_size=%u Support configuring inline xattr size, it depends on + flexible inline xattr feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created From 73d797ef482b163f327ae1a16d17aaecd77a798f Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 23 Jan 2019 15:49:44 +0800 Subject: [PATCH 024/134] f2fs: jump to label 'free_node_inode' when failing from d_make_root() When sb->s_root is NULL dput() will do nothing, so jump to label 'free_node_inode' instead of lable 'free_root_inode' when failing from d_make_root(). Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b5127a1440d9..b3b9a74303ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3321,7 +3321,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); From 0c8c07140e00139e7aa76e2223eebee3a7c24ebd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 24 Jan 2019 15:18:47 +0800 Subject: [PATCH 025/134] f2fs: fix to check inline_xattr_size boundary correctly We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. .bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 10 +++++----- include/linux/f2fs_fs.h | 13 +++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1fa0013b8669..c696b27d8cc4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -460,7 +460,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b3b9a74303ba..3166b7266f9e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -835,11 +835,11 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8d57aaee8166..666db8eb71e0 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry - * bitmap 1 * 27 = 27 1 * 23 = 23 - * reserved 1 * 3 = 3 1 * 7 = 7 - * dentry 11 * 214 = 2354 11 * 182 = 2002 - * filename 8 * 214 = 1712 8 * 182 = 1456 - * total 4096 3488 + * regular dentry inline dentry (def) inline dentry (min) + * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 + * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 + * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 + * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 + * total 4096 3488 40 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,6 +507,7 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) +#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From fb157acbc0ac07452354773bf5c80afe2ef4cad1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Jan 2019 20:18:06 +0800 Subject: [PATCH 026/134] f2fs: fix to avoid deadlock of atomic file operations Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fdd8cd21522f..3602acee4985 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,7 +215,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +228,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -318,13 +328,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_unlock(&fi->inmem_lock); + } + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -429,12 +445,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; From 72bfaa916781ee355b65f8b1c74439a50f574872 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 2 Feb 2019 17:33:01 +0800 Subject: [PATCH 027/134] f2fs: fix potential data inconsistence of checkpoint Previously, we changed lock from cp_rwsem to node_change, it solved the deadlock issue which was caused by below race condition: Thread A Thread B - f2fs_setattr - f2fs_lock_op -- read_lock - dquot_transfer - __dquot_transfer - dquot_acquire - commit_dqblk - f2fs_quota_write - f2fs_write_begin - f2fs_write_failed - write_checkpoint - block_operations - f2fs_lock_all -- write_lock - f2fs_truncate_blocks - f2fs_lock_op -- read_lock But it breaks the sematics of cp_rwsem, in other callers like: - f2fs_file_write_iter -> f2fs_write_begin -> f2fs_write_failed - f2fs_direct_IO -> f2fs_write_failed We allow to truncate dnode w/o cp_rwsem held, result in incorrect sit bitmap update, which can cause further data corruption. So this patch reverts previous fix implementation, and try to fix deadlock by skipping calling f2fs_truncate_blocks() in f2fs_write_failed() only for quota file, and keep the preallocated data/node in the tail of quota file, we can expecte that the preallocated space can be used to store quota info latter soon. Fixes: af033b2aa8a8 ("f2fs: guarantee journalled quota data by checkpoint") Signed-off-by: Gao Xiang Signed-off-by: Sheng Yong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 14 ++++++-------- fs/f2fs/inline.c | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c0ee1c06e70e..b94d4eca22f4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2311,7 +2311,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c696b27d8cc4..f5814dc68fa5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2843,8 +2843,7 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 149710854a3a..5df1253c2010 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -589,8 +589,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -646,7 +644,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -1262,7 +1260,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1447,7 +1445,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9804cdcf976e..3ebae34a8b96 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } From ad65135dc447f8348ef06552af5a1420a819ee61 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 4 Feb 2019 13:36:53 +0530 Subject: [PATCH 028/134] f2fs: do not use mutex lock in atomic context Fix below warning coming because of using mutex lock in atomic context. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:98 in_atomic(): 1, irqs_disabled(): 0, pid: 585, name: sh Preemption disabled at: __radix_tree_preload+0x28/0x130 Call trace: dump_backtrace+0x0/0x2b4 show_stack+0x20/0x28 dump_stack+0xa8/0xe0 ___might_sleep+0x144/0x194 __might_sleep+0x58/0x8c mutex_lock+0x2c/0x48 f2fs_trace_pid+0x88/0x14c f2fs_set_node_page_dirty+0xd0/0x184 Do not use f2fs_radix_tree_insert() to avoid doing cond_resched() with spin_lock() acquired. Signed-off-by: Sahitya Tummala Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } From cd1d2b21131ce65f3eebb5de0524c5d71c267cf5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Feb 2019 20:57:53 -0800 Subject: [PATCH 029/134] Revert "f2fs: fix to check inline_xattr_size boundary correctly" This reverts commit 802a643228462e0e3f6e84977d912f1871f61467. --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 10 +++++----- include/linux/f2fs_fs.h | 13 ++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f5814dc68fa5..5c0da6c4f187 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -460,6 +460,7 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3166b7266f9e..b3b9a74303ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -835,11 +835,11 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size > - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 666db8eb71e0..8d57aaee8166 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry (def) inline dentry (min) - * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 - * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 - * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 - * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 - * total 4096 3488 40 + * regular dentry inline dentry + * bitmap 1 * 27 = 27 1 * 23 = 23 + * reserved 1 * 3 = 3 1 * 7 = 7 + * dentry 11 * 214 = 2354 11 * 182 = 2002 + * filename 8 * 214 = 1712 8 * 182 = 1456 + * total 4096 3488 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,7 +507,6 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) -#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From d34a48c54c315f4f33b193c5657b7394867062e4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Feb 2019 09:47:34 -0800 Subject: [PATCH 030/134] Revert "f2fs: fix to avoid deadlock of atomic file operations" This reverts commit f3ac182210162c7e76997a8566a7f9869349f3d8. --- fs/f2fs/segment.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3602acee4985..fdd8cd21522f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,8 +215,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) + struct list_head *head, bool drop, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -228,16 +227,7 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. - */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } + lock_page(page); f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -328,19 +318,13 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - - if (list_empty(&fi->inmem_pages)) { - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); - } - } + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -445,15 +429,12 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); + err = __revoke_inmem_pages(inode, &revoke_list, false, true); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); + __revoke_inmem_pages(inode, &revoke_list, false, false); } return err; From 5cf19c008d09b7f4f00bb2658a93e490f1b5856f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:08:25 +0800 Subject: [PATCH 031/134] f2fs: fix to check inline_xattr_size boundary correctly We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. .bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 13 +++++++------ include/linux/f2fs_fs.h | 13 +++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5c0da6c4f187..f5814dc68fa5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -460,7 +460,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b3b9a74303ba..7432d1a7938a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -834,12 +834,13 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8d57aaee8166..666db8eb71e0 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry - * bitmap 1 * 27 = 27 1 * 23 = 23 - * reserved 1 * 3 = 3 1 * 7 = 7 - * dentry 11 * 214 = 2354 11 * 182 = 2002 - * filename 8 * 214 = 1712 8 * 182 = 1456 - * total 4096 3488 + * regular dentry inline dentry (def) inline dentry (min) + * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 + * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 + * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 + * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 + * total 4096 3488 40 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,6 +507,7 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) +#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From 72e41a1bb0d52dddb7c1ee12fa6ca20fd98cdde5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:16:15 +0800 Subject: [PATCH 032/134] f2fs: don't allow negative ->write_io_size_bits As Dan reported: "We put an upper bound on ->write_io_size_bits but we don't have a lower bound." So let's add lower bound check for ->write_io_size_bits in parse_options(). [We don't allow configuring ->write_io_size_bits to zero, since at least we need to fill one dummy page for aligned IO.] Reported-by: Dan Carpenter Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7432d1a7938a..c415c8135296 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); From 1b5eba010446b752b0038b8fa900b253bb057f9a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Feb 2019 07:59:57 -0800 Subject: [PATCH 033/134] f2fs: don't clear CP_QUOTA_NEED_FSCK_FLAG If we met this once, let fsck.f2fs clear this only. Note that, this addresses all the subtle fault injection test. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 47d720562b3a..62a8770cb19b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1267,8 +1267,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); From 85cedd366c63a7672f38354f39329754df759ce1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Feb 2019 19:04:38 -0800 Subject: [PATCH 034/134] f2fs: fix wrong #endif We have to cover whole headerfile with last #endif. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f5814dc68fa5..26c28a78f38b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3627,8 +3627,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3641,3 +3639,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif From 09081958cbe80fae92daded756751442cedc6281 Mon Sep 17 00:00:00 2001 From: Zeng Guangyue Date: Mon, 18 Feb 2019 14:26:41 +0800 Subject: [PATCH 035/134] f2fs: correct spelling mistake correct spelling mistake for "nunmber" Signed-off-by: Zeng Guangyue Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 666db8eb71e0..f5740423b002 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -285,7 +285,7 @@ enum { struct node_footer { __le32 nid; /* node id */ - __le32 ino; /* inode nunmber */ + __le32 ino; /* inode number */ __le32 flag; /* include cold/fsync/dentry marks and offset */ __le64 cp_ver; /* checkpoint version */ __le32 next_blkaddr; /* next node page block address */ From 4d1af3c6f4b65d5fc49c5eacbf3dc9d94fb4469a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 19 Feb 2019 10:31:52 +0800 Subject: [PATCH 036/134] f2fs: silence VM_WARN_ON_ONCE in mempool_alloc Note that __GFP_ZERO is not supported for mempool_alloc, which also documented in the mempool_alloc comments. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b94d4eca22f4..b9ee3a450d57 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -298,9 +298,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); From 5fc60b5c67b80f5d4e65250358a2eb6665280253 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 16:23:53 +0800 Subject: [PATCH 037/134] f2fs: fix to retry fill_super only if recovery failed With current retry mechanism in f2fs_fill_super, first fill_super fails due to no memory, then second fill_super runs w/o recovery, if we succeed, we may lose fsynced data, it doesn't make sense. Let's retry fill_super only if it occurs non-ENOMEM error during recovery. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c415c8135296..2dc279daf7f2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3053,10 +3053,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3344,7 +3345,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3361,11 +3362,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3381,7 +3384,7 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3427,7 +3430,7 @@ skip_recovery: sync_free_meta: /* safe to flush all the data */ sync_filesystem(sbi->sb); - retry = false; + retry_cnt = 0; free_meta: #ifdef CONFIG_QUOTA @@ -3487,8 +3490,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } From 95484da8dd2366bf7f680061e62812b138a43c92 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 17:08:18 +0800 Subject: [PATCH 038/134] f2fs: make fault injection covering __submit_flush_wait() This patch changes to allow failure of f2fs_bio_alloc() in __submit_flush_wait(), which can simulate flush error in checkpoint() for covering more error paths. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fdd8cd21522f..2537893e9466 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -542,9 +542,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); From 3823b7d1081f438def137ce486a73f6e6d5e2fdc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:37:14 +0800 Subject: [PATCH 039/134] f2fs: fix encrypted page memory leak For IPU path of f2fs_do_write_data_page(), in its error path, we need to release encrypted page and fscrypt context, otherwise it will cause memory leak. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b9ee3a450d57..91dad12942b4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1860,8 +1860,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; From 357040ed18b6b7d83cf1734ec4bf1026267de4bc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:40:13 +0800 Subject: [PATCH 040/134] f2fs: fix to update iostat correctly in IPU path In error path of IPU, we didn't account iostat correctly, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2537893e9466..d74359325da6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3182,10 +3182,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } From c6fcc42aaf5ff55cbbf12cd45f29c3441bd622a0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 21 Feb 2019 12:57:35 +0800 Subject: [PATCH 041/134] f2fs: no need to take page lock in readdir VFS will take inode_lock for readdir, therefore no need to take page lock in readdir at all just as the majority of other generic filesystems. This patch improves concurrency since .iterate_shared was introduced to VFS years ago. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 0e0e0880c4de..dd06815048f4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -896,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -914,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); From fb28b034f8aa3e92b0ac303ad965397704a1d7e5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 25 Feb 2019 09:46:45 -0800 Subject: [PATCH 042/134] f2fs: give random value to i_generation This follows to give random number to i_generation along with commit 232530680290b ("ext4: improve smp scalability for inode generation") This can be used for DUN for UFS HW encryption. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- fs/f2fs/namei.c | 3 ++- fs/f2fs/super.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26c28a78f38b..26afa6b31f96 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1243,8 +1243,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 826e708a0ed0..c97f1665d430 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = F2FS_I(inode)->i_crtime = current_time(inode); - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2dc279daf7f2..f1baa20bb974 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3129,7 +3129,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; From 2568f36fa988a796b8786bb7ac16413380dc050e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Feb 2019 09:48:27 +0800 Subject: [PATCH 043/134] f2fs: fix to dirty inode for i_mode recovery As Seulbae Kim reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202637 We didn't recover permission field correctly after sudden power-cut, the reason is in setattr we didn't add inode into global dirty list once i_mode is changed, so latter checkpoint triggered by fsync will not flush last i_mode into disk, result in this problem, fix it. Reported-by: Seulbae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5df1253c2010..c6085c966e6b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -766,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -841,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -856,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From 23feb6194dc9cd1359f79ee369eb7d927ade5672 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Feb 2019 17:11:03 +0800 Subject: [PATCH 044/134] f2fs: fix to avoid deadlock of atomic file operations Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d74359325da6..e730c334abba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,7 +215,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +228,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -318,13 +328,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -429,12 +445,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; From 9bc3cabe1acb4522ac2084533c140c788c9904eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:15 +0800 Subject: [PATCH 045/134] f2fs: trace f2fs_ioc_shutdown This patch supports to trace f2fs_ioc_shutdown. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6085c966e6b..1b9106523f27 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1987,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c842bc70c68c..4bf5c7d35d52 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -150,6 +150,14 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +#define show_shutdown_mode(type) \ + __print_symbolic(type, \ + { F2FS_GOING_DOWN_FULLSYNC, "full sync" }, \ + { F2FS_GOING_DOWN_METASYNC, "meta sync" }, \ + { F2FS_GOING_DOWN_NOSYNC, "no sync" }, \ + { F2FS_GOING_DOWN_METAFLUSH, "meta flush" }, \ + { F2FS_GOING_DOWN_NEED_FSCK, "need fsck" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1620,6 +1628,30 @@ DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, TP_ARGS(sb, type, count) ); +TRACE_EVENT(f2fs_shutdown, + + TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret), + + TP_ARGS(sbi, mode, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, mode) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + __entry->mode = mode; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), mode: %s, ret:%d", + show_dev(__entry->dev), + show_shutdown_mode(__entry->mode), + __entry->ret) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 2ef9ad7b9b133bac997e3c2e8fea5433c23dc845 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:16 +0800 Subject: [PATCH 046/134] f2fs: print more parameters in trace_f2fs_map_blocks for better map_blocks trace. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4bf5c7d35d52..2ce6af3235fb 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -545,6 +545,9 @@ TRACE_EVENT(f2fs_map_blocks, __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) + __field(unsigned int, m_flags) + __field(int, m_seg_type) + __field(bool, m_may_create) __field(int, ret) ), @@ -554,15 +557,22 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; __entry->m_len = map->m_len; + __entry->m_flags = map->m_flags; + __entry->m_seg_type = map->m_seg_type; + __entry->m_may_create = map->m_may_create; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," + "seg_type = %d, may_create = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, (unsigned long long)__entry->m_len, + __entry->m_flags, + __entry->m_seg_type, + __entry->m_may_create, __entry->ret) ); From d8c29f1e8c3162ce9d585c4b4b55d78ed413e1ef Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 17:52:33 +0800 Subject: [PATCH 047/134] f2fs: fix to use kvfree instead of kzfree As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202747 System can panic due to using wrong allocate/free function pair in xattr interface: - use kvmalloc to allocate memory - use kzfree to free memory Let's fix to use kvfree instead of kzfree, BTW, we are safe to get rid of kzfree, since there is no such confidential data stored as xattr, we don't need to zero it before free memory. Fixes: 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed") Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index fa620d31ea5f..ca47224d78ab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -340,7 +340,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -383,7 +383,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -510,7 +510,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -556,7 +556,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -687,7 +687,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + kvfree(base_addr); return error; } From 3031d67522f4eb9c0b9feaefee1635f896e2413e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 16:18:33 +0800 Subject: [PATCH 048/134] f2fs: remove wrong comment in f2fs_invalidate_page() Since 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer"), we've started to not skip clear private flag for atomic_write page truncation, so removing old wrong comment in f2fs_invalidate_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91dad12942b4..f16bc212088f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2710,7 +2710,6 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); From ac9805cff1fcac21889af7348aae793664225109 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 17:30:59 +0800 Subject: [PATCH 049/134] f2fs: fix to add refcount once page is tagged PG_private As Gao Xiang reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202749 f2fs may skip pageout() due to incorrect page reference count. The problem here is that MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. But currently, f2fs won't add/del refcount when changing PG_private flag. Anyway, f2fs should follow MM's rule to make MM's related flows running as expected. [1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com/ Reported-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 21 ++++++++------------- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 21 +++++++++++++++++++++ fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 9 +++------ 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 62a8770cb19b..5029480e69f7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -406,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -957,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f16bc212088f..9107411a6dab 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2713,8 +2713,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2728,8 +2727,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2797,12 +2795,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2823,9 +2817,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index dd06815048f4..d3eafe9e2ed2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26afa6b31f96..2327059af4a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2836,6 +2836,27 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e39ca804df6f..93e9d422c99a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1963,7 +1963,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e730c334abba..aa7fe79b62b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -280,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -370,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); From f36230ae4a730301e67c631a972f4a1f13824f66 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Mar 2019 17:31:30 +0800 Subject: [PATCH 050/134] f2fs: don't trigger read IO for beyond EOF page In f2fs_mpage_readpages(), if page is beyond EOF, we should just zero out it, but previously, before checking previous mapping info, we missed to check filesize boundary, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9107411a6dab..ec503a11e534 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1550,6 +1550,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1562,16 +1565,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. */ - map.m_flags = 0; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; - - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1586,6 +1584,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); From d895b644336cde3f22e885cd3fb0be85bb7aad0b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Mar 2019 11:49:53 -0700 Subject: [PATCH 051/134] f2fs: give some messages for inline_xattr_size This patch adds some kernel messages when user sets wrong inline_xattr_size. Fixes: 500e0b28ecd3 ("f2fs: fix to check inline_xattr_size boundary correctly") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f1baa20bb974..bb75398d9f99 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,15 +836,18 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (F2FS_OPTION(sbi).inline_xattr_size < - sizeof(struct f2fs_xattr_header) / sizeof(__le32) || - F2FS_OPTION(sbi).inline_xattr_size > - DEF_ADDRS_PER_INODE - + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); return -EINVAL; } } From 4ff98f21c14cde80af119e81c7bacd18ac34db34 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Mar 2019 17:19:04 +0800 Subject: [PATCH 052/134] f2fs: fix to do sanity check with inode.i_inline_xattr_size As Paul Bandha reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202709 When I run the poc on the mounted f2fs img I get a buffer overflow in read_inline_xattr due to there being no sanity check on the value of i_inline_xattr_size. I created the img by just modifying the value of i_inline_xattr_size in the inode: i_name [test1.txt] i_ext: fofs:0 blkaddr:0 len:0 i_extra_isize [0x 18 : 24] i_inline_xattr_size [0x ffff : 65535] i_addr[ofs] [0x 0 : 0] mkdir /mnt/f2fs mount ./f2fs1.img /mnt/f2fs gcc poc.c -o poc ./poc int main() { int y = syscall(SYS_listxattr, "/mnt/f2fs/test1.txt", NULL, 0); printf("ret %d", y); printf("errno: %d\n", errno); } BUG: KASAN: slab-out-of-bounds in read_inline_xattr+0x18f/0x260 Read of size 262140 at addr ffff88011035efd8 by task f2fs1poc/3263 CPU: 0 PID: 3263 Comm: f2fs1poc Not tainted 4.18.0-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.1-0-g0551a4be2c-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x71/0xab print_address_description+0x83/0x250 kasan_report+0x213/0x350 memcpy+0x1f/0x50 read_inline_xattr+0x18f/0x260 read_all_xattrs+0xba/0x190 f2fs_listxattr+0x9d/0x3f0 listxattr+0xb2/0xd0 path_listxattr+0x93/0xe0 do_syscall_64+0x9d/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Let's add sanity check for inode.i_inline_xattr_size during f2fs_iget() to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 15 +++++++++++++++ fs/f2fs/super.c | 5 +---- fs/f2fs/xattr.h | 6 ++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bec52961630b..d44e6de70cb3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bb75398d9f99..27d36b08f67b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -838,10 +838,7 @@ static int parse_options(struct super_block *sb, char *options) } min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); - max_size = DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || F2FS_OPTION(sbi).inline_xattr_size > max_size) { diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 67db134da0f5..9172ee082ca8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. From a23295a806b9a074faab902adf011801dfed1a27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 19:32:26 +0800 Subject: [PATCH 053/134] f2fs: fix to adapt small inline xattr space in __find_inline_xattr() With below testcase, we will fail to find existed xattr entry: 1. mkfs.f2fs -O extra_attr -O flexible_inline_xattr /dev/zram0 2. mount -t f2fs -o inline_xattr_size=1 /dev/zram0 /mnt/f2fs/ 3. touch /mnt/f2fs/file 4. setfattr -n "user.name" -v 0 /mnt/f2fs/file 5. getfattr -n "user.name" /mnt/f2fs/file /mnt/f2fs/file: user.name: No such attribute The reason is for inode which has very small inline xattr size, __find_inline_xattr() will fail to traverse any entry due to first entry may not be loaded from xattr node yet, later, we may skip to check entire xattr datas in __find_xattr(), result in such wrong condition. This patch adds condition to check such case to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ca47224d78ab..848a785abe25 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } From 5a98485eb30cb971d07ac38d09c85c531d6f40b8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Mar 2019 15:44:27 +0800 Subject: [PATCH 054/134] f2fs: fix to avoid deadlock in f2fs_read_inline_dir() As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202883 sometimes, dead lock when make system call SYS_getdents64 with fsync() is called by another process. monkey running on android9.0 1. task 9785 held sbi->cp_rwsem and waiting lock_page() 2. task 10349 held mm_sem and waiting sbi->cp_rwsem 3. task 9709 held lock_page() and waiting mm_sem so this is a dead lock scenario. task stack is show by crash tools as following crash_arm64> bt ffffffc03c354080 PID: 9785 TASK: ffffffc03c354080 CPU: 1 COMMAND: "RxIoScheduler-3" >> #7 [ffffffc01b50fac0] __lock_page at ffffff80081b11e8 crash-arm64> bt 10349 PID: 10349 TASK: ffffffc018b83080 CPU: 1 COMMAND: "BUGLY_ASYNC_UPL" >> #3 [ffffffc01f8cfa40] rwsem_down_read_failed at ffffff8008a93afc PC: 00000033 LR: 00000000 SP: 00000000 PSTATE: ffffffffffffffff crash-arm64> bt 9709 PID: 9709 TASK: ffffffc03e7f3080 CPU: 1 COMMAND: "IntentService[A" >> #3 [ffffffc001e67850] rwsem_down_read_failed at ffffff8008a93afc >> #8 [ffffffc001e67b80] el1_ia at ffffff8008084fc4 PC: ffffff8008274114 [compat_filldir64+120] LR: ffffff80083584d4 [f2fs_fill_dentries+448] SP: ffffffc001e67b80 PSTATE: 80400145 X29: ffffffc001e67b80 X28: 0000000000000000 X27: 000000000000001a X26: 00000000000093d7 X25: ffffffc070d52480 X24: 0000000000000008 X23: 0000000000000028 X22: 00000000d43dfd60 X21: ffffffc001e67e90 X20: 0000000000000011 X19: ffffff80093a4000 X18: 0000000000000000 X17: 0000000000000000 X16: 0000000000000000 X15: 0000000000000000 X14: ffffffffffffffff X13: 0000000000000008 X12: 0101010101010101 X11: 7f7f7f7f7f7f7f7f X10: 6a6a6a6a6a6a6a6a X9: 7f7f7f7f7f7f7f7f X8: 0000000080808000 X7: ffffff800827409c X6: 0000000080808000 X5: 0000000000000008 X4: 00000000000093d7 X3: 000000000000001a X2: 0000000000000011 X1: ffffffc070d52480 X0: 0000000000800238 >> #9 [ffffffc001e67be0] f2fs_fill_dentries at ffffff80083584d0 PC: 0000003c LR: 00000000 SP: 00000000 PSTATE: 000000d9 X12: f48a02ff X11: d4678960 X10: d43dfc00 X9: d4678ae4 X8: 00000058 X7: d4678994 X6: d43de800 X5: 000000d9 X4: d43dfc0c X3: d43dfc10 X2: d46799c8 X1: 00000000 X0: 00001068 Below potential deadlock will happen between three threads: Thread A Thread B Thread C - f2fs_do_sync_file - f2fs_write_checkpoint - down_write(&sbi->node_change) -- 1) - do_page_fault - down_write(&mm->mmap_sem) -- 2) - do_wp_page - f2fs_vm_page_mkwrite - getdents64 - f2fs_read_inline_dir - lock_page -- 3) - f2fs_sync_node_pages - lock_page -- 3) - __do_map_lock - down_read(&sbi->node_change) -- 1) - f2fs_fill_dentries - dir_emit - compat_filldir64 - do_page_fault - down_read(&mm->mmap_sem) -- 2) Since f2fs_readdir is protected by inode.i_rwsem, there should not be any updates in inode page, we're safe to lookup dents in inode page without its lock held, so taking off the lock to improve concurrency of readdir and avoid potential deadlock. Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3ebae34a8b96..c16bb32f42f0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } From 4592dca5048f0d6d684d0fa52c8b8c308c46c21f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Mar 2019 16:15:08 -0700 Subject: [PATCH 055/134] f2fs: set pin_file under CAP_SYS_ADMIN Android uses pin_file for uncrypt during OTA, and that should be managed by CAP_SYS_ADMIN only. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1b9106523f27..69db0d1c34a0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2873,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT; From 02b2463554590586ca14abe3b87886b893504390 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Tue, 23 Oct 2018 08:56:04 -0700 Subject: [PATCH 056/134] ANDROID: dm-bow: Add dm-bow feature Based on https://www.redhat.com/archives/dm-devel/2019-March/msg00025.html Third version of dm-bow. Key changes: Free list added Support for block sizes other than 4k Handles writes during trim phase, and overlapping trims Integer overflow error Support trims even if underlying device doesn't Numerous small bug fixes bow == backup on write USE CASE: dm-bow takes a snapshot of an existing file system before mounting. The user may, before removing the device, commit the snapshot. Alternatively the user may remove the device and then run a command line utility to restore the device to its original state. dm-bow does not require an external device dm-bow efficiently uses all the available free space on the file system. IMPLEMENTATION: dm-bow can be in one of three states. In state one, the free blocks on the device are identified by issuing an FSTRIM to the filesystem. In state two, any writes cause the overwritten data to be backup up to the available free space. While in this state, the device can be restored by unmounting the filesystem, removing the dm-bow device and running a usermode tool over the underlying device. In state three, the changes are committed, dm-bow is in pass-through mode and the drive can no longer be restored. It is planned to use this driver to enable restoration of a failed update attempt on Android devices using ext4. Test: Can boot Android with userdata mounted on this device. Can commit userdata after SUW has run. Can then reboot, make changes and roll back. Known issues: Mutex is held around entire flush operation, including lengthy I/O. Plan is to convert to state machine with pending queues. Interaction with block encryption is unknown, especially with respect to sector 0. Bug: 119769411 Change-Id: Id70988bbd797ebe3e76fc175094388b423c8da8c Signed-off-by: Paul Lawrence --- Documentation/device-mapper/dm-bow.txt | 99 ++ drivers/md/Kconfig | 13 + drivers/md/Makefile | 1 + drivers/md/dm-bow.c | 1226 ++++++++++++++++++++++++ 4 files changed, 1339 insertions(+) create mode 100644 Documentation/device-mapper/dm-bow.txt create mode 100644 drivers/md/dm-bow.c diff --git a/Documentation/device-mapper/dm-bow.txt b/Documentation/device-mapper/dm-bow.txt new file mode 100644 index 000000000000..e3fc4d22e0f4 --- /dev/null +++ b/Documentation/device-mapper/dm-bow.txt @@ -0,0 +1,99 @@ +dm_bow (backup on write) +======================== + +dm_bow is a device mapper driver that uses the free space on a device to back up +data that is overwritten. The changes can then be committed by a simple state +change, or rolled back by removing the dm_bow device and running a command line +utility over the underlying device. + +dm_bow has three states, set by writing ‘1’ or ‘2’ to /sys/block/dm-?/bow/state. +It is only possible to go from state 0 (initial state) to state 1, and then from +state 1 to state 2. + +State 0: dm_bow collects all trims to the device and assumes that these mark +free space on the overlying file system that can be safely used. Typically the +mount code would create the dm_bow device, mount the file system, call the +FITRIM ioctl on the file system then switch to state 1. These trims are not +propagated to the underlying device. + +State 1: All writes to the device cause the underlying data to be backed up to +the free (trimmed) area as needed in such a way as they can be restored. +However, the writes, with one exception, then happen exactly as they would +without dm_bow, so the device is always in a good final state. The exception is +that sector 0 is used to keep a log of the latest changes, both to indicate that +we are in this state and to allow rollback. See below for all details. If there +isn't enough free space, writes are failed with -ENOSPC. + +State 2: The transition to state 2 triggers replacing the special sector 0 with +the normal sector 0, and the freeing of all state information. dm_bow then +becomes a pass-through driver, allowing the device to continue to be used with +minimal performance impact. + +Usage +===== +dm-bow takes one command line parameter, the name of the underlying device. + +dm-bow will typically be used in the following way. dm-bow will be loaded with a +suitable underlying device and the resultant device will be mounted. A file +system trim will be issued via the FITRIM ioctl, then the device will be +switched to state 1. The file system will now be used as normal. At some point, +the changes can either be committed by switching to state 2, or rolled back by +unmounting the file system, removing the dm-bow device and running the command +line utility. Note that rebooting the device will be equivalent to unmounting +and removing, but the command line utility must still be run + +Details of operation in state 1 +=============================== + +dm_bow maintains a type for all sectors. A sector can be any of: + +SECTOR0 +SECTOR0_CURRENT +UNCHANGED +FREE +CHANGED +BACKUP + +SECTOR0 is the first sector on the device, and is used to hold the log of +changes. This is the one exception. + +SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and +writes from the true sector zero are redirected to. Note that like any backup +sector, if the sector is written to directly, it must be moved again. + +UNCHANGED means that the sector has not been changed since we entered state 1. +Thus if it is written to or trimmed, the contents must first be backed up. + +FREE means that the sector was trimmed in state 0 and has not yet been written +to or used for backup. On being written to, a FREE sector is changed to CHANGED. + +CHANGED means that the sector has been modified, and can be further modified +without further backup. + +BACKUP means that this is a free sector being used as a backup. On being written +to, the contents must first be backed up again. + +All backup operations are logged to the first sector. The log sector has the +format: +-------------------------------------------------------- +| Magic | Count | Sequence | Log entry | Log entry | … +-------------------------------------------------------- + +Magic is a magic number. Count is the number of log entries. Sequence is 0 +initially. A log entry is + +----------------------------------- +| Source | Dest | Size | Checksum | +----------------------------------- + +When SECTOR0 is full, the log sector is backed up and another empty log sector +created with sequence number one higher. The first entry in any log entry with +sequence > 0 therefore must be the log of the backing up of the previous log +sector. Note that sequence is not strictly needed, but is a useful sanity check +and potentially limits the time spent trying to restore a corrupted snapshot. + +On entering state 1, dm_bow has a list of free sectors. All other sectors are +unchanged. Sector0_current is selected from the free sectors and the contents of +sector 0 are copied there. The sector 0 is backed up, which triggers the first +log entry to be written. + diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e458bd8519e8..0e643556bb44 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -586,4 +586,17 @@ config DM_ANDROID_VERITY_AT_MOST_ONCE_DEFAULT_ENABLED any more after all the data blocks it covers have been verified anyway. If unsure, say N. + +config DM_BOW + tristate "Backup block device" + depends on BLK_DEV_DM + select DM_BUFIO + ---help--- + This device-mapper target takes a device and keeps a log of all + changes using free blocks identified by issuing a trim command. + This can then be restored by running a command line utility, + or committed by simply replacing the target. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f98717ffff9e..214be71181f4 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o +obj-$(CONFIG_DM_BOW) += dm-bow.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-bow.c b/drivers/md/dm-bow.c new file mode 100644 index 000000000000..0d176aa140df --- /dev/null +++ b/drivers/md/dm-bow.c @@ -0,0 +1,1226 @@ +/* + * Copyright (C) 2018 Google Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-core.h" + +#include +#include +#include + +#define DM_MSG_PREFIX "bow" + +struct log_entry { + u64 source; + u64 dest; + u32 size; + u32 checksum; +} __packed; + +struct log_sector { + u32 magic; + u16 header_version; + u16 header_size; + u32 block_size; + u32 count; + u32 sequence; + sector_t sector0; + struct log_entry entries[]; +} __packed; + +/* + * MAGIC is BOW in ascii + */ +#define MAGIC 0x00574f42 +#define HEADER_VERSION 0x0100 + +/* + * A sorted set of ranges representing the state of the data on the device. + * Use an rb_tree for fast lookup of a given sector + * Consecutive ranges are always of different type - operations on this + * set must merge matching consecutive ranges. + * + * Top range is always of type TOP + */ +struct bow_range { + struct rb_node node; + sector_t sector; + enum { + INVALID, /* Type not set */ + SECTOR0, /* First sector - holds log record */ + SECTOR0_CURRENT,/* Live contents of sector0 */ + UNCHANGED, /* Original contents */ + TRIMMED, /* Range has been trimmed */ + CHANGED, /* Range has been changed */ + BACKUP, /* Range is being used as a backup */ + TOP, /* Final range - sector is size of device */ + } type; + struct list_head trimmed_list; /* list of TRIMMED ranges */ +}; + +static const char * const readable_type[] = { + "Invalid", + "Sector0", + "Sector0_current", + "Unchanged", + "Free", + "Changed", + "Backup", + "Top", +}; + +enum state { + TRIM, + CHECKPOINT, + COMMITTED, +}; + +struct bow_context { + struct dm_dev *dev; + u32 block_size; + u32 block_shift; + struct workqueue_struct *workqueue; + struct dm_bufio_client *bufio; + struct mutex ranges_lock; /* Hold to access this struct and/or ranges */ + struct rb_root ranges; + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ + atomic_t state; /* One of the enum state values above */ + u64 trims_total; + struct log_sector *log_sector; + struct list_head trimmed_list; + bool forward_trims; +}; + +sector_t range_top(struct bow_range *br) +{ + return container_of(rb_next(&br->node), struct bow_range, node) + ->sector; +} + +u64 range_size(struct bow_range *br) +{ + return (range_top(br) - br->sector) * SECTOR_SIZE; +} + +static sector_t bvec_top(struct bvec_iter *bi_iter) +{ + return bi_iter->bi_sector + bi_iter->bi_size / SECTOR_SIZE; +} + +/* + * Find the first range that overlaps with bi_iter + * bi_iter is set to the size of the overlapping sub-range + */ +static struct bow_range *find_first_overlapping_range(struct rb_root *ranges, + struct bvec_iter *bi_iter) +{ + struct rb_node *node = ranges->rb_node; + struct bow_range *br; + + while (node) { + br = container_of(node, struct bow_range, node); + + if (br->sector <= bi_iter->bi_sector + && bi_iter->bi_sector < range_top(br)) + break; + + if (bi_iter->bi_sector < br->sector) + node = node->rb_left; + else + node = node->rb_right; + } + + WARN_ON(!node); + if (!node) + return NULL; + + if (range_top(br) - bi_iter->bi_sector + < bi_iter->bi_size >> SECTOR_SHIFT) + bi_iter->bi_size = (range_top(br) - bi_iter->bi_sector) + << SECTOR_SHIFT; + + return br; +} + +void add_before(struct rb_root *ranges, struct bow_range *new_br, + struct bow_range *existing) +{ + struct rb_node *parent = &(existing->node); + struct rb_node **link = &(parent->rb_left); + + while (*link) { + parent = *link; + link = &((*link)->rb_right); + } + + rb_link_node(&new_br->node, parent, link); + rb_insert_color(&new_br->node, ranges); +} + +/* + * Given a range br returned by find_first_overlapping_range, split br into a + * leading range, a range matching the bi_iter and a trailing range. + * Leading and trailing may end up size 0 and will then be deleted. The + * new range matching the bi_iter is then returned and should have its type + * and type specific fields populated. + * If bi_iter runs off the end of the range, bi_iter is truncated accordingly + */ +static int split_range(struct bow_context *bc, struct bow_range **br, + struct bvec_iter *bi_iter) +{ + struct bow_range *new_br; + + if (bi_iter->bi_sector < (*br)->sector) { + WARN_ON(true); + return BLK_STS_IOERR; + } + + if (bi_iter->bi_sector > (*br)->sector) { + struct bow_range *leading_br = + kzalloc(sizeof(*leading_br), GFP_KERNEL); + + if (!leading_br) + return BLK_STS_RESOURCE; + + *leading_br = **br; + if (leading_br->type == TRIMMED) + list_add(&leading_br->trimmed_list, &bc->trimmed_list); + + add_before(&bc->ranges, leading_br, *br); + (*br)->sector = bi_iter->bi_sector; + } + + if (bvec_top(bi_iter) >= range_top(*br)) { + bi_iter->bi_size = (range_top(*br) - (*br)->sector) + * SECTOR_SIZE; + return BLK_STS_OK; + } + + /* new_br will be the beginning, existing br will be the tail */ + new_br = kzalloc(sizeof(*new_br), GFP_KERNEL); + if (!new_br) + return BLK_STS_RESOURCE; + + new_br->sector = (*br)->sector; + (*br)->sector = bvec_top(bi_iter); + add_before(&bc->ranges, new_br, *br); + *br = new_br; + + return BLK_STS_OK; +} + +/* + * Sets type of a range. May merge range into surrounding ranges + * Since br may be invalidated, always sets br to NULL to prevent + * usage after this is called + */ +static void set_type(struct bow_context *bc, struct bow_range **br, int type) +{ + struct bow_range *prev = container_of(rb_prev(&(*br)->node), + struct bow_range, node); + struct bow_range *next = container_of(rb_next(&(*br)->node), + struct bow_range, node); + + if ((*br)->type == TRIMMED) { + bc->trims_total -= range_size(*br); + list_del(&(*br)->trimmed_list); + } + + if (type == TRIMMED) { + bc->trims_total += range_size(*br); + list_add(&(*br)->trimmed_list, &bc->trimmed_list); + } + + (*br)->type = type; + + if (next->type == type) { + if (type == TRIMMED) + list_del(&next->trimmed_list); + rb_erase(&next->node, &bc->ranges); + kfree(next); + } + + if (prev->type == type) { + if (type == TRIMMED) + list_del(&(*br)->trimmed_list); + rb_erase(&(*br)->node, &bc->ranges); + kfree(*br); + } + + *br = NULL; +} + +static struct bow_range *find_free_range(struct bow_context *bc) +{ + if (list_empty(&bc->trimmed_list)) { + DMERR("Unable to find free space to back up to"); + return NULL; + } + + return list_first_entry(&bc->trimmed_list, struct bow_range, + trimmed_list); +} + +static sector_t sector_to_page(struct bow_context const *bc, sector_t sector) +{ + WARN_ON(sector % (bc->block_size / SECTOR_SIZE) != 0); + return sector >> (bc->block_shift - SECTOR_SHIFT); +} + +static int copy_data(struct bow_context const *bc, + struct bow_range *source, struct bow_range *dest, + u32 *checksum) +{ + int i; + + if (range_size(source) != range_size(dest)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (checksum) + *checksum = sector_to_page(bc, source->sector); + + for (i = 0; i < range_size(source) >> bc->block_shift; ++i) { + struct dm_buffer *read_buffer, *write_buffer; + u8 *read, *write; + sector_t page = sector_to_page(bc, source->sector) + i; + + read = dm_bufio_read(bc->bufio, page, &read_buffer); + if (IS_ERR(read)) { + DMERR("Cannot read page %lu", page); + return PTR_ERR(read); + } + + if (checksum) + *checksum = crc32(*checksum, read, bc->block_size); + + write = dm_bufio_new(bc->bufio, + sector_to_page(bc, dest->sector) + i, + &write_buffer); + if (IS_ERR(write)) { + DMERR("Cannot write sector"); + dm_bufio_release(read_buffer); + return PTR_ERR(write); + } + + memcpy(write, read, bc->block_size); + + dm_bufio_mark_buffer_dirty(write_buffer); + dm_bufio_release(write_buffer); + dm_bufio_release(read_buffer); + } + + dm_bufio_write_dirty_buffers(bc->bufio); + return BLK_STS_OK; +} + +/****** logging functions ******/ + +static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest, + unsigned int size, u32 checksum); + +static int backup_log_sector(struct bow_context *bc) +{ + struct bow_range *first_br, *free_br; + struct bvec_iter bi_iter; + u32 checksum = 0; + int ret; + + first_br = container_of(rb_first(&bc->ranges), struct bow_range, node); + + if (first_br->type != SECTOR0) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (range_size(first_br) != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + free_br = find_free_range(bc); + /* No space left - return this error to userspace */ + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + if (bi_iter.bi_size != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + ret = copy_data(bc, first_br, free_br, &checksum); + if (ret) + return ret; + + bc->log_sector->count = 0; + bc->log_sector->sequence++; + ret = add_log_entry(bc, first_br->sector, free_br->sector, + range_size(first_br), checksum); + if (ret) + return ret; + + set_type(bc, &free_br, BACKUP); + return BLK_STS_OK; +} + +static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest, + unsigned int size, u32 checksum) +{ + struct dm_buffer *sector_buffer; + u8 *sector; + + if (sizeof(struct log_sector) + + sizeof(struct log_entry) * (bc->log_sector->count + 1) + > bc->block_size) { + int ret = backup_log_sector(bc); + + if (ret) + return ret; + } + + sector = dm_bufio_new(bc->bufio, 0, §or_buffer); + if (IS_ERR(sector)) { + DMERR("Cannot write boot sector"); + dm_bufio_release(sector_buffer); + return BLK_STS_NOSPC; + } + + bc->log_sector->entries[bc->log_sector->count].source = source; + bc->log_sector->entries[bc->log_sector->count].dest = dest; + bc->log_sector->entries[bc->log_sector->count].size = size; + bc->log_sector->entries[bc->log_sector->count].checksum = checksum; + bc->log_sector->count++; + + memcpy(sector, bc->log_sector, bc->block_size); + dm_bufio_mark_buffer_dirty(sector_buffer); + dm_bufio_release(sector_buffer); + dm_bufio_write_dirty_buffers(bc->bufio); + return BLK_STS_OK; +} + +static int prepare_log(struct bow_context *bc) +{ + struct bow_range *free_br, *first_br; + struct bvec_iter bi_iter; + u32 checksum = 0; + int ret; + + /* Carve out first sector as log sector */ + first_br = container_of(rb_first(&bc->ranges), struct bow_range, node); + if (first_br->type != UNCHANGED) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + if (range_size(first_br) < bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + bi_iter.bi_sector = 0; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &first_br, &bi_iter); + if (ret) + return ret; + first_br->type = SECTOR0; + if (range_size(first_br) != bc->block_size) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + /* Find free sector for active sector0 reads/writes */ + free_br = find_free_range(bc); + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + free_br->type = SECTOR0_CURRENT; + + /* Copy data */ + ret = copy_data(bc, first_br, free_br, NULL); + if (ret) + return ret; + + bc->log_sector->sector0 = free_br->sector; + + /* Find free sector to back up original sector zero */ + free_br = find_free_range(bc); + if (!free_br) + return BLK_STS_NOSPC; + bi_iter.bi_sector = free_br->sector; + bi_iter.bi_size = bc->block_size; + ret = split_range(bc, &free_br, &bi_iter); + if (ret) + return ret; + + /* Back up */ + ret = copy_data(bc, first_br, free_br, &checksum); + if (ret) + return ret; + + /* + * Set up our replacement boot sector - it will get written when we + * add the first log entry, which we do immediately + */ + bc->log_sector->magic = MAGIC; + bc->log_sector->header_version = HEADER_VERSION; + bc->log_sector->header_size = sizeof(*bc->log_sector); + bc->log_sector->block_size = bc->block_size; + bc->log_sector->count = 0; + bc->log_sector->sequence = 0; + + /* Add log entry */ + ret = add_log_entry(bc, first_br->sector, free_br->sector, + range_size(first_br), checksum); + if (ret) + return ret; + + set_type(bc, &free_br, BACKUP); + return BLK_STS_OK; +} + +static struct bow_range *find_sector0_current(struct bow_context *bc) +{ + struct bvec_iter bi_iter; + + bi_iter.bi_sector = bc->log_sector->sector0; + bi_iter.bi_size = bc->block_size; + return find_first_overlapping_range(&bc->ranges, &bi_iter); +} + +/****** sysfs interface functions ******/ + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&bc->state)); +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + enum state state, original_state; + int ret; + + state = buf[0] - '0'; + if (state < TRIM || state > COMMITTED) { + DMERR("State value %d out of range", state); + return -EINVAL; + } + + mutex_lock(&bc->ranges_lock); + original_state = atomic_read(&bc->state); + if (state != original_state + 1) { + DMERR("Invalid state change from %d to %d", + original_state, state); + ret = -EINVAL; + goto bad; + } + + DMINFO("Switching to state %s", state == CHECKPOINT ? "Checkpoint" + : state == COMMITTED ? "Committed" : "Unknown"); + + if (state == CHECKPOINT) { + ret = prepare_log(bc); + if (ret) { + DMERR("Failed to switch to checkpoint state"); + goto bad; + } + } else if (state == COMMITTED) { + struct bow_range *br = find_sector0_current(bc); + struct bow_range *sector0_br = + container_of(rb_first(&bc->ranges), struct bow_range, + node); + + ret = copy_data(bc, br, sector0_br, 0); + if (ret) { + DMERR("Failed to switch to committed state"); + goto bad; + } + } + atomic_inc(&bc->state); + ret = count; + +bad: + mutex_unlock(&bc->ranges_lock); + return ret; +} + +static ssize_t free_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct bow_context *bc = container_of(kobj, struct bow_context, + kobj_holder.kobj); + u64 trims_total; + + mutex_lock(&bc->ranges_lock); + trims_total = bc->trims_total; + mutex_unlock(&bc->ranges_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", trims_total); +} + +static struct kobj_attribute attr_state = __ATTR_RW(state); +static struct kobj_attribute attr_free = __ATTR_RO(free); + +static struct attribute *bow_attrs[] = { + &attr_state.attr, + &attr_free.attr, + NULL +}; + +static struct kobj_type bow_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = bow_attrs, + .release = dm_kobject_release +}; + +/****** constructor/destructor ******/ + +static void dm_bow_dtr(struct dm_target *ti) +{ + struct bow_context *bc = (struct bow_context *) ti->private; + struct kobject *kobj; + + while (rb_first(&bc->ranges)) { + struct bow_range *br = container_of(rb_first(&bc->ranges), + struct bow_range, node); + + rb_erase(&br->node, &bc->ranges); + kfree(br); + } + if (bc->workqueue) + destroy_workqueue(bc->workqueue); + if (bc->bufio) + dm_bufio_client_destroy(bc->bufio); + + kobj = &bc->kobj_holder.kobj; + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + + kfree(bc->log_sector); + kfree(bc); +} + +static int dm_bow_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bow_context *bc; + struct bow_range *br; + int ret; + struct mapped_device *md = dm_table_get_md(ti->table); + + if (argc != 1) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + bc = kzalloc(sizeof(*bc), GFP_KERNEL); + if (!bc) { + ti->error = "Cannot allocate bow context"; + return -ENOMEM; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; + ti->private = bc; + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &bc->dev); + if (ret) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (bc->dev->bdev->bd_queue->limits.max_discard_sectors == 0) { + bc->dev->bdev->bd_queue->limits.discard_granularity = 1 << 12; + bc->dev->bdev->bd_queue->limits.max_hw_discard_sectors = 1 << 15; + bc->dev->bdev->bd_queue->limits.max_discard_sectors = 1 << 15; + bc->forward_trims = false; + } else { + bc->forward_trims = true; + } + + bc->block_size = bc->dev->bdev->bd_queue->limits.logical_block_size; + bc->block_shift = ilog2(bc->block_size); + bc->log_sector = kzalloc(bc->block_size, GFP_KERNEL); + if (!bc->log_sector) { + ti->error = "Cannot allocate log sector"; + goto bad; + } + + init_completion(&bc->kobj_holder.completion); + ret = kobject_init_and_add(&bc->kobj_holder.kobj, &bow_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", + "bow"); + if (ret) { + ti->error = "Cannot create sysfs node"; + goto bad; + } + + mutex_init(&bc->ranges_lock); + bc->ranges = RB_ROOT; + bc->bufio = dm_bufio_client_create(bc->dev->bdev, bc->block_size, 1, 0, + NULL, NULL); + if (IS_ERR(bc->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + ret = PTR_ERR(bc->bufio); + bc->bufio = NULL; + goto bad; + } + + bc->workqueue = alloc_workqueue("dm-bow", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM + | WQ_UNBOUND, num_online_cpus()); + if (!bc->workqueue) { + ti->error = "Cannot allocate workqueue"; + ret = -ENOMEM; + goto bad; + } + + INIT_LIST_HEAD(&bc->trimmed_list); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = ti->len; + br->type = TOP; + rb_link_node(&br->node, NULL, &bc->ranges.rb_node); + rb_insert_color(&br->node, &bc->ranges); + + br = kzalloc(sizeof(*br), GFP_KERNEL); + if (!br) { + ti->error = "Cannot allocate ranges"; + ret = -ENOMEM; + goto bad; + } + + br->sector = 0; + br->type = UNCHANGED; + rb_link_node(&br->node, bc->ranges.rb_node, + &bc->ranges.rb_node->rb_left); + rb_insert_color(&br->node, &bc->ranges); + + ti->discards_supported = true; + + return 0; + +bad: + dm_bow_dtr(ti); + return ret; +} + +/****** Handle writes ******/ + +static int prepare_unchanged_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter, + bool record_checksum) +{ + struct bow_range *backup_br; + struct bvec_iter backup_bi; + sector_t log_source, log_dest; + unsigned int log_size; + u32 checksum = 0; + int ret; + int original_type; + sector_t sector0; + + /* Find a free range */ + backup_br = find_free_range(bc); + if (!backup_br) + return BLK_STS_NOSPC; + + /* Carve out a backup range. This may be smaller than the br given */ + backup_bi.bi_sector = backup_br->sector; + backup_bi.bi_size = min(range_size(backup_br), (u64) bi_iter->bi_size); + ret = split_range(bc, &backup_br, &backup_bi); + if (ret) + return ret; + + /* + * Carve out a changed range. This will not be smaller than the backup + * br since the backup br is smaller than the source range and iterator + */ + bi_iter->bi_size = backup_bi.bi_size; + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + if (range_size(br) != range_size(backup_br)) { + WARN_ON(1); + return BLK_STS_IOERR; + } + + + /* Copy data over */ + ret = copy_data(bc, br, backup_br, record_checksum ? &checksum : NULL); + if (ret) + return ret; + + /* Add an entry to the log */ + log_source = br->sector; + log_dest = backup_br->sector; + log_size = range_size(br); + + /* + * Set the types. Note that since set_type also amalgamates ranges + * we have to set both sectors to their final type before calling + * set_type on either + */ + original_type = br->type; + sector0 = backup_br->sector; + if (backup_br->type == TRIMMED) + list_del(&backup_br->trimmed_list); + backup_br->type = br->type == SECTOR0_CURRENT ? SECTOR0_CURRENT + : BACKUP; + br->type = CHANGED; + set_type(bc, &backup_br, backup_br->type); + + /* + * Add the log entry after marking the backup sector, since adding a log + * can cause another backup + */ + ret = add_log_entry(bc, log_source, log_dest, log_size, checksum); + if (ret) { + br->type = original_type; + return ret; + } + + /* Now it is safe to mark this backup successful */ + if (original_type == SECTOR0_CURRENT) + bc->log_sector->sector0 = sector0; + + set_type(bc, &br, br->type); + return ret; +} + +static int prepare_free_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + int ret; + + ret = split_range(bc, &br, bi_iter); + if (ret) + return ret; + set_type(bc, &br, CHANGED); + return BLK_STS_OK; +} + +static int prepare_changed_range(struct bow_context *bc, struct bow_range *br, + struct bvec_iter *bi_iter) +{ + /* Nothing to do ... */ + return BLK_STS_OK; +} + +static int prepare_one_range(struct bow_context *bc, + struct bvec_iter *bi_iter) +{ + struct bow_range *br = find_first_overlapping_range(&bc->ranges, + bi_iter); + switch (br->type) { + case CHANGED: + return prepare_changed_range(bc, br, bi_iter); + + case TRIMMED: + return prepare_free_range(bc, br, bi_iter); + + case UNCHANGED: + case BACKUP: + return prepare_unchanged_range(bc, br, bi_iter, true); + + /* + * We cannot track the checksum for the active sector0, since it + * may change at any point. + */ + case SECTOR0_CURRENT: + return prepare_unchanged_range(bc, br, bi_iter, false); + + case SECTOR0: /* Handled in the dm_bow_map */ + case TOP: /* Illegal - top is off the end of the device */ + default: + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +struct write_work { + struct work_struct work; + struct bow_context *bc; + struct bio *bio; +}; + +static void bow_write(struct work_struct *work) +{ + struct write_work *ww = container_of(work, struct write_work, work); + struct bow_context *bc = ww->bc; + struct bio *bio = ww->bio; + struct bvec_iter bi_iter = bio->bi_iter; + int ret = BLK_STS_OK; + + kfree(ww); + + mutex_lock(&bc->ranges_lock); + do { + ret = prepare_one_range(bc, &bi_iter); + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + } while (!ret && bi_iter.bi_size); + + mutex_unlock(&bc->ranges_lock); + + if (!ret) { + bio_set_dev(bio, bc->dev->bdev); + submit_bio(bio); + } else { + DMERR("Write failure with error %d", -ret); + bio->bi_status = ret; + bio_endio(bio); + } +} + +static int queue_write(struct bow_context *bc, struct bio *bio) +{ + struct write_work *ww = kmalloc(sizeof(*ww), GFP_NOIO | __GFP_NORETRY + | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (!ww) { + DMERR("Failed to allocate write_work"); + return -ENOMEM; + } + + INIT_WORK(&ww->work, bow_write); + ww->bc = bc; + ww->bio = bio; + queue_work(bc->workqueue, &ww->work); + return DM_MAPIO_SUBMITTED; +} + +static int handle_sector0(struct bow_context *bc, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + + if (bio->bi_iter.bi_size > bc->block_size) { + struct bio * split = bio_split(bio, + bc->block_size >> SECTOR_SHIFT, + GFP_NOIO, + &fs_bio_set); + if (!split) { + DMERR("Failed to split bio"); + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + + bio_chain(split, bio); + split->bi_iter.bi_sector = bc->log_sector->sector0; + bio_set_dev(split, bc->dev->bdev); + submit_bio(split); + + if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + } else { + bio->bi_iter.bi_sector = bc->log_sector->sector0; + } + + return ret; +} + +static int add_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("add_trim: %lu, %u", + bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, TRIMMED); + break; + + case TRIMMED: + /* Nothing to do */ + break; + + default: + /* No other case is legal in TRIM state */ + WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + bio_endio(bio); + return DM_MAPIO_SUBMITTED; +} + +static int remove_trim(struct bow_context *bc, struct bio *bio) +{ + struct bow_range *br; + struct bvec_iter bi_iter = bio->bi_iter; + + DMDEBUG("remove_trim: %lu, %u", + bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + + do { + br = find_first_overlapping_range(&bc->ranges, &bi_iter); + + switch (br->type) { + case UNCHANGED: + /* Nothing to do */ + break; + + case TRIMMED: + if (!split_range(bc, &br, &bi_iter)) + set_type(bc, &br, UNCHANGED); + break; + + default: + /* No other case is legal in TRIM state */ + WARN_ON(true); + break; + } + + bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE; + bi_iter.bi_size = bio->bi_iter.bi_size + - (bi_iter.bi_sector - bio->bi_iter.bi_sector) + * SECTOR_SIZE; + + } while (bi_iter.bi_size); + + return DM_MAPIO_REMAPPED; +} + +int remap_unless_illegal_trim(struct bow_context *bc, struct bio *bio) +{ + if (!bc->forward_trims && bio_op(bio) == REQ_OP_DISCARD) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } else { + bio_set_dev(bio, bc->dev->bdev); + return DM_MAPIO_REMAPPED; + } +} + +/****** dm interface ******/ + +static int dm_bow_map(struct dm_target *ti, struct bio *bio) +{ + int ret = DM_MAPIO_REMAPPED; + struct bow_context *bc = ti->private; + + if (likely(bc->state.counter == COMMITTED)) + return remap_unless_illegal_trim(bc, bio); + + if (bio_data_dir(bio) == READ && bio->bi_iter.bi_sector != 0) + return remap_unless_illegal_trim(bc, bio); + + if (atomic_read(&bc->state) != COMMITTED) { + enum state state; + + mutex_lock(&bc->ranges_lock); + state = atomic_read(&bc->state); + if (state == TRIM) { + if (bio_op(bio) == REQ_OP_DISCARD) + ret = add_trim(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = remove_trim(bc, bio); + else + /* pass-through */; + } else if (state == CHECKPOINT) { + if (bio->bi_iter.bi_sector == 0) + ret = handle_sector0(bc, bio); + else if (bio_data_dir(bio) == WRITE) + ret = queue_write(bc, bio); + else + /* pass-through */; + } else { + /* pass-through */ + } + mutex_unlock(&bc->ranges_lock); + } + + if (ret == DM_MAPIO_REMAPPED) + return remap_unless_illegal_trim(bc, bio); + + return ret; +} + +static void dm_bow_tablestatus(struct dm_target *ti, char *result, + unsigned int maxlen) +{ + char *end = result + maxlen; + struct bow_context *bc = ti->private; + struct rb_node *i; + int trimmed_list_length = 0; + int trimmed_range_count = 0; + struct bow_range *br; + + if (maxlen == 0) + return; + result[0] = 0; + + list_for_each_entry(br, &bc->trimmed_list, trimmed_list) + if (br->type == TRIMMED) { + ++trimmed_list_length; + } else { + scnprintf(result, end - result, + "ERROR: non-trimmed entry in trimmed_list"); + return; + } + + if (!rb_first(&bc->ranges)) { + scnprintf(result, end - result, "ERROR: Empty ranges"); + return; + } + + if (container_of(rb_first(&bc->ranges), struct bow_range, node) + ->sector) { + scnprintf(result, end - result, + "ERROR: First range does not start at sector 0"); + return; + } + + for (i = rb_first(&bc->ranges); i; i = rb_next(i)) { + struct bow_range *br = container_of(i, struct bow_range, node); + + result += scnprintf(result, end - result, "%s: %lu", + readable_type[br->type], br->sector); + if (result >= end) + return; + + result += scnprintf(result, end - result, "\n"); + if (result >= end) + return; + + if (br->type == TRIMMED) + ++trimmed_range_count; + + if (br->type == TOP) { + if (br->sector != ti->len) { + scnprintf(result, end - result, + "\nERROR: Top sector is incorrect"); + } + + if (&br->node != rb_last(&bc->ranges)) { + scnprintf(result, end - result, + "\nERROR: Top sector is not last"); + } + + break; + } + + if (!rb_next(i)) { + scnprintf(result, end - result, + "\nERROR: Last range not of type TOP"); + return; + } + + if (br->sector > range_top(br)) { + scnprintf(result, end - result, + "\nERROR: sectors out of order"); + return; + } + } + + if (trimmed_range_count != trimmed_list_length) + scnprintf(result, end - result, + "\nERROR: not all trimmed ranges in trimmed list"); +} + +static void dm_bow_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + switch (type) { + case STATUSTYPE_INFO: + if (maxlen) + result[0] = 0; + break; + + case STATUSTYPE_TABLE: + dm_bow_tablestatus(ti, result, maxlen); + break; + } +} + +int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct bow_context *bc = ti->private; + struct dm_dev *dev = bc->dev; + + *bdev = dev->bdev; + /* Only pass ioctls through if the device sizes match exactly. */ + return ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +static int dm_bow_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct bow_context *bc = ti->private; + + return fn(ti, bc->dev, 0, ti->len, data); +} + +static struct target_type bow_target = { + .name = "bow", + .version = {1, 1, 1}, + .module = THIS_MODULE, + .ctr = dm_bow_ctr, + .dtr = dm_bow_dtr, + .map = dm_bow_map, + .status = dm_bow_status, + .prepare_ioctl = dm_bow_prepare_ioctl, + .iterate_devices = dm_bow_iterate_devices, +}; + +int __init dm_bow_init(void) +{ + int r = dm_register_target(&bow_target); + + if (r < 0) + DMERR("registering bow failed %d", r); + return r; +} + +void dm_bow_exit(void) +{ + dm_unregister_target(&bow_target); +} + +MODULE_LICENSE("GPL"); + +module_init(dm_bow_init); +module_exit(dm_bow_exit); From bd00bd48d4a837c73a3e388f513cc8758ace98b0 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Fri, 22 Mar 2019 08:25:28 -0700 Subject: [PATCH 057/134] ANDROID: dm-bow: backport to 4.14 Change-Id: Ib89b91adaa11b84744de9167bda57ff0056f4415 Signed-off-by: Paul Lawrence --- drivers/md/dm-bow.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-bow.c b/drivers/md/dm-bow.c index 0d176aa140df..fa0675d58853 100644 --- a/drivers/md/dm-bow.c +++ b/drivers/md/dm-bow.c @@ -5,13 +5,14 @@ */ #include "dm.h" +#include "dm-bufio.h" #include "dm-core.h" #include -#include #include #define DM_MSG_PREFIX "bow" +#define SECTOR_SIZE 512 struct log_entry { u64 source; @@ -925,7 +926,7 @@ static int handle_sector0(struct bow_context *bc, struct bio *bio) struct bio * split = bio_split(bio, bc->block_size >> SECTOR_SHIFT, GFP_NOIO, - &fs_bio_set); + fs_bio_set); if (!split) { DMERR("Failed to split bio"); bio->bi_status = BLK_STS_RESOURCE; @@ -1176,7 +1177,8 @@ static void dm_bow_status(struct dm_target *ti, status_type_t type, } } -int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + fmode_t *mode) { struct bow_context *bc = ti->private; struct dm_dev *dev = bc->dev; From af6f1cbdedef485ccc40bcb741a61d85160750e4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:14 -0800 Subject: [PATCH 058/134] UPSTREAM: binder: create userspace-to-binder-buffer copy function The binder driver uses a vm_area to map the per-process binder buffer space. For 32-bit android devices, this is now taking too much vmalloc space. This patch removes the use of vm_area when copying the transaction data from the sender to the buffer space. Instead of using copy_from_user() for multi-page copies, it now uses binder_alloc_copy_user_to_buffer() which uses kmap() and kunmap() to map each page, and uses copy_from_user() for copying to that page. (cherry picked from 1a7c3d9bb7a926e88d5f57643e75ad1abfc55013) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: I59ff83455984fce4626476e30601ed8b99858a92 --- drivers/android/binder.c | 29 +++++++-- drivers/android/binder_alloc.c | 113 +++++++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 8 +++ 3 files changed, 143 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index c7478aeb9b10..264b5aaa12da 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3177,8 +3177,12 @@ static void binder_transaction(struct binder_proc *proc, ALIGN(tr->data_size, sizeof(void *))); offp = off_start; - if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) - tr->data.ptr.buffer, tr->data_size)) { + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, 0, + (const void __user *) + (uintptr_t)tr->data.ptr.buffer, + tr->data_size)) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -3186,8 +3190,13 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_copy_data_failed; } - if (copy_from_user(offp, (const void __user *)(uintptr_t) - tr->data.ptr.offsets, tr->offsets_size)) { + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, + ALIGN(tr->data_size, sizeof(void *)), + (const void __user *) + (uintptr_t)tr->data.ptr.offsets, + tr->offsets_size)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -3317,6 +3326,8 @@ static void binder_transaction(struct binder_proc *proc, struct binder_buffer_object *bp = to_binder_buffer_object(hdr); size_t buf_left = sg_buf_end - sg_bufp; + binder_size_t sg_buf_offset = (uintptr_t)sg_bufp - + (uintptr_t)t->buffer->data; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", @@ -3326,9 +3337,13 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - if (copy_from_user(sg_bufp, - (const void __user *)(uintptr_t) - bp->buffer, bp->length)) { + if (binder_alloc_copy_user_to_buffer( + &target_proc->alloc, + t->buffer, + sg_buf_offset, + (const void __user *) + (uintptr_t)bp->buffer, + bp->length)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error_param = -EFAULT; diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e00d4d13810a..248bce37c252 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include "binder_alloc.h" #include "binder_trace.h" @@ -1042,3 +1044,114 @@ int binder_alloc_shrinker_init(void) } return ret; } + +/** + * check_buffer() - verify that buffer/offset is safe to access + * @alloc: binder_alloc for this proc + * @buffer: binder buffer to be accessed + * @offset: offset into @buffer data + * @bytes: bytes to access from offset + * + * Check that the @offset/@bytes are within the size of the given + * @buffer and that the buffer is currently active and not freeable. + * Offsets must also be multiples of sizeof(u32). The kernel is + * allowed to touch the buffer in two cases: + * + * 1) when the buffer is being created: + * (buffer->free == 0 && buffer->allow_user_free == 0) + * 2) when the buffer is being torn down: + * (buffer->free == 0 && buffer->transaction == NULL). + * + * Return: true if the buffer is safe to access + */ +static inline bool check_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t offset, size_t bytes) +{ + size_t buffer_size = binder_alloc_buffer_size(alloc, buffer); + + return buffer_size >= bytes && + offset <= buffer_size - bytes && + IS_ALIGNED(offset, sizeof(u32)) && + !buffer->free && + (!buffer->allow_user_free || !buffer->transaction); +} + +/** + * binder_alloc_get_page() - get kernel pointer for given buffer offset + * @alloc: binder_alloc for this proc + * @buffer: binder buffer to be accessed + * @buffer_offset: offset into @buffer data + * @pgoffp: address to copy final page offset to + * + * Lookup the struct page corresponding to the address + * at @buffer_offset into @buffer->data. If @pgoffp is not + * NULL, the byte-offset into the page is written there. + * + * The caller is responsible to ensure that the offset points + * to a valid address within the @buffer and that @buffer is + * not freeable by the user. Since it can't be freed, we are + * guaranteed that the corresponding elements of @alloc->pages[] + * cannot change. + * + * Return: struct page + */ +static struct page *binder_alloc_get_page(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + pgoff_t *pgoffp) +{ + binder_size_t buffer_space_offset = buffer_offset + + (buffer->data - alloc->buffer); + pgoff_t pgoff = buffer_space_offset & ~PAGE_MASK; + size_t index = buffer_space_offset >> PAGE_SHIFT; + struct binder_lru_page *lru_page; + + lru_page = &alloc->pages[index]; + *pgoffp = pgoff; + return lru_page->page_ptr; +} + +/** + * binder_alloc_copy_user_to_buffer() - copy src user to tgt user + * @alloc: binder_alloc for this proc + * @buffer: binder buffer to be accessed + * @buffer_offset: offset into @buffer data + * @from: userspace pointer to source buffer + * @bytes: bytes to copy + * + * Copy bytes from source userspace to target buffer. + * + * Return: bytes remaining to be copied + */ +unsigned long +binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + const void __user *from, + size_t bytes) +{ + if (!check_buffer(alloc, buffer, buffer_offset, bytes)) + return bytes; + + while (bytes) { + unsigned long size; + unsigned long ret; + struct page *page; + pgoff_t pgoff; + void *kptr; + + page = binder_alloc_get_page(alloc, buffer, + buffer_offset, &pgoff); + size = min_t(size_t, bytes, PAGE_SIZE - pgoff); + kptr = kmap(page) + pgoff; + ret = copy_from_user(kptr, from, size); + kunmap(page); + if (ret) + return bytes - size + ret; + bytes -= size; + from += size; + buffer_offset += size; + } + return 0; +} diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index fb3238c74c8a..b37a3e327003 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -22,6 +22,7 @@ #include #include #include +#include extern struct list_lru binder_alloc_lru; struct binder_transaction; @@ -183,5 +184,12 @@ binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) return alloc->user_buffer_offset; } +unsigned long +binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + const void __user *from, + size_t bytes); + #endif /* _LINUX_BINDER_ALLOC_H */ From 4d01f6b116f24693b09172878a8b03adb65a54cc Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:15 -0800 Subject: [PATCH 059/134] BACKPORT: binder: add functions to copy to/from binder buffers Avoid vm_area when copying to or from binder buffers. Instead, new copy functions are added that copy from kernel space to binder buffer space. These use kmap_atomic() and kunmap_atomic() to create temporary mappings and then memcpy() is used to copy within that page. Also, kmap_atomic() / kunmap_atomic() use the appropriate cache flushing to support VIVT cache architectures. Allow binder to build if CPU_CACHE_VIVT is defined. Several uses of the new functions are added here. More to follow in subsequent patches. (cherry picked from commit 8ced0c6231ead26eca8cb416dcb7cc1c2cdd41d8) Bug: 67668716 Change-Id: I6a93d2396d0a80c352a1d563fc7fb523a753e38c Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 88 +++++++++++++++++++++++++++------- drivers/android/binder_alloc.c | 59 +++++++++++++++++++++++ drivers/android/binder_alloc.h | 12 +++++ 3 files changed, 141 insertions(+), 18 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 264b5aaa12da..e91e26a32553 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2368,14 +2368,22 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, off_end = (void *)off_start + buffer->offsets_size; for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; - size_t object_size = binder_validate_object(buffer, *offp); - + size_t object_size; + binder_size_t object_offset; + binder_size_t buffer_offset = (uintptr_t)offp - + (uintptr_t)buffer->data; + + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + buffer, buffer_offset, + sizeof(object_offset)); + object_size = binder_validate_object(buffer, object_offset); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", - debug_id, (u64)*offp, buffer->data_size); + debug_id, (u64)object_offset, buffer->data_size); continue; } - hdr = (struct binder_object_header *)(buffer->data + *offp); + hdr = (struct binder_object_header *) + (buffer->data + object_offset); switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { @@ -2469,8 +2477,20 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, continue; } fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); - for (fd_index = 0; fd_index < fda->num_fds; fd_index++) - task_close_fd(proc, fd_array[fd_index]); + for (fd_index = 0; fd_index < fda->num_fds; + fd_index++) { + u32 fd; + binder_size_t offset = + (uintptr_t)&fd_array[fd_index] - + (uintptr_t)buffer->data; + + binder_alloc_copy_from_buffer(&proc->alloc, + &fd, + buffer, + offset, + sizeof(fd)); + task_close_fd(proc, fd); + } } break; default: pr_err("transaction release %d bad object type %x\n", @@ -2699,11 +2719,21 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, return -EINVAL; } for (fdi = 0; fdi < fda->num_fds; fdi++) { - target_fd = binder_translate_fd(fd_array[fdi], t, thread, - in_reply_to); + u32 fd; + int target_fd; + binder_size_t offset = + (uintptr_t)&fd_array[fdi] - + (uintptr_t)t->buffer->data; + + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + target_fd = binder_translate_fd(fd, t, thread, in_reply_to); if (target_fd < 0) goto err_translate_fd_failed; - fd_array[fdi] = target_fd; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, offset, + &target_fd, sizeof(fd)); } return 0; @@ -2713,8 +2743,17 @@ err_translate_fd_failed: * installed so far. */ num_installed_fds = fdi; - for (fdi = 0; fdi < num_installed_fds; fdi++) - task_close_fd(target_proc, fd_array[fdi]); + for (fdi = 0; fdi < num_installed_fds; fdi++) { + u32 fd; + binder_size_t offset = + (uintptr_t)&fd_array[fdi] - + (uintptr_t)t->buffer->data; + + binder_alloc_copy_from_buffer(&target_proc->alloc, + &fd, t->buffer, + offset, sizeof(fd)); + task_close_fd(target_proc, fd); + } return target_fd; } @@ -3165,7 +3204,9 @@ static void binder_transaction(struct binder_proc *proc, t->security_ctx = (uintptr_t)kptr + binder_alloc_get_user_buffer_offset(&target_proc->alloc); - memcpy(kptr, secctx, secctx_sz); + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, buf_offset, + secctx, secctx_sz); security_release_secctx(secctx, secctx_sz); secctx = NULL; } @@ -3227,11 +3268,21 @@ static void binder_transaction(struct binder_proc *proc, off_min = 0; for (; offp < off_end; offp++) { struct binder_object_header *hdr; - size_t object_size = binder_validate_object(t->buffer, *offp); - - if (object_size == 0 || *offp < off_min) { + size_t object_size; + binder_size_t object_offset; + binder_size_t buffer_offset = + (uintptr_t)offp - (uintptr_t)t->buffer->data; + + binder_alloc_copy_from_buffer(&target_proc->alloc, + &object_offset, + t->buffer, + buffer_offset, + sizeof(object_offset)); + object_size = binder_validate_object(t->buffer, object_offset); + if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", - proc->pid, thread->pid, (u64)*offp, + proc->pid, thread->pid, + (u64)object_offset, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; @@ -3240,8 +3291,9 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_offset; } - hdr = (struct binder_object_header *)(t->buffer->data + *offp); - off_min = *offp + object_size; + hdr = (struct binder_object_header *) + (t->buffer->data + object_offset); + off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 248bce37c252..abd02fa49f3e 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1155,3 +1155,62 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, } return 0; } + +static void binder_alloc_do_buffer_copy(struct binder_alloc *alloc, + bool to_buffer, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *ptr, + size_t bytes) +{ + /* All copies must be 32-bit aligned and 32-bit size */ + BUG_ON(!check_buffer(alloc, buffer, buffer_offset, bytes)); + + while (bytes) { + unsigned long size; + struct page *page; + pgoff_t pgoff; + void *tmpptr; + void *base_ptr; + + page = binder_alloc_get_page(alloc, buffer, + buffer_offset, &pgoff); + size = min_t(size_t, bytes, PAGE_SIZE - pgoff); + base_ptr = kmap_atomic(page); + tmpptr = base_ptr + pgoff; + if (to_buffer) + memcpy(tmpptr, ptr, size); + else + memcpy(ptr, tmpptr, size); + /* + * kunmap_atomic() takes care of flushing the cache + * if this device has VIVT cache arch + */ + kunmap_atomic(base_ptr); + bytes -= size; + pgoff = 0; + ptr = ptr + size; + buffer_offset += size; + } +} + +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes) +{ + binder_alloc_do_buffer_copy(alloc, true, buffer, buffer_offset, + src, bytes); +} + +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes) +{ + binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, + dest, bytes); +} + diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index b37a3e327003..a9bec2e923f7 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -191,5 +191,17 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, const void __user *from, size_t bytes); +void binder_alloc_copy_to_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + void *src, + size_t bytes); + +void binder_alloc_copy_from_buffer(struct binder_alloc *alloc, + void *dest, + struct binder_buffer *buffer, + binder_size_t buffer_offset, + size_t bytes); + #endif /* _LINUX_BINDER_ALLOC_H */ From 5088f1355ea1c497c6f2dfbf05e05595424205f5 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:16 -0800 Subject: [PATCH 060/134] BACKPORT: binder: add function to copy binder object from buffer When creating or tearing down a transaction, the binder driver examines objects in the buffer and takes appropriate action. To do this without needing to dereference pointers into the buffer, the local copies of the objects are needed. This patch introduces a function to validate and copy binder objects from the buffer to a local structure. (cherry pick from commit 7a67a39320dfba4b36d3be5dae4581194e650316) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: I42dfe238a2d20bdeff479068ca87a80e4577e64a --- drivers/android/binder.c | 75 +++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e91e26a32553..58b2420ae45a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -663,6 +663,26 @@ struct binder_transaction { spinlock_t lock; }; +/** + * struct binder_object - union of flat binder object types + * @hdr: generic object header + * @fbo: binder object (nodes and refs) + * @fdo: file descriptor object + * @bbo: binder buffer pointer + * @fdao: file descriptor array + * + * Used for type-independent object copies + */ +struct binder_object { + union { + struct binder_object_header hdr; + struct flat_binder_object fbo; + struct binder_fd_object fdo; + struct binder_buffer_object bbo; + struct binder_fd_array_object fdao; + }; +}; + /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire @@ -2199,26 +2219,33 @@ static void binder_cleanup_transaction(struct binder_transaction *t, } /** - * binder_validate_object() - checks for a valid metadata object in a buffer. + * binder_get_object() - gets object and checks for valid metadata + * @proc: binder_proc owning the buffer * @buffer: binder_buffer that we're parsing. - * @offset: offset in the buffer at which to validate an object. + * @offset: offset in the @buffer at which to validate an object. + * @object: struct binder_object to read into * * Return: If there's a valid metadata object at @offset in @buffer, the - * size of that object. Otherwise, it returns zero. + * size of that object. Otherwise, it returns zero. The object + * is read into the struct binder_object pointed to by @object. */ -static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) +static size_t binder_get_object(struct binder_proc *proc, + struct binder_buffer *buffer, + unsigned long offset, + struct binder_object *object) { - /* Check if we can read a header first */ + size_t read_size; struct binder_object_header *hdr; size_t object_size = 0; - if (buffer->data_size < sizeof(*hdr) || - offset > buffer->data_size - sizeof(*hdr) || - !IS_ALIGNED(offset, sizeof(u32))) + read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); + if (read_size < sizeof(*hdr)) return 0; + binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, + offset, read_size); - /* Ok, now see if we can read a complete object. */ - hdr = (struct binder_object_header *)(buffer->data + offset); + /* Ok, now see if we read a complete object. */ + hdr = &object->hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: @@ -2369,6 +2396,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size; + struct binder_object object; binder_size_t object_offset; binder_size_t buffer_offset = (uintptr_t)offp - (uintptr_t)buffer->data; @@ -2376,14 +2404,14 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, sizeof(object_offset)); - object_size = binder_validate_object(buffer, object_offset); + object_size = binder_get_object(proc, buffer, + object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); continue; } - hdr = (struct binder_object_header *) - (buffer->data + object_offset); + hdr = &object.hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { @@ -3269,6 +3297,7 @@ static void binder_transaction(struct binder_proc *proc, for (; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size; + struct binder_object object; binder_size_t object_offset; binder_size_t buffer_offset = (uintptr_t)offp - (uintptr_t)t->buffer->data; @@ -3278,7 +3307,8 @@ static void binder_transaction(struct binder_proc *proc, t->buffer, buffer_offset, sizeof(object_offset)); - object_size = binder_validate_object(t->buffer, object_offset); + object_size = binder_get_object(target_proc, t->buffer, + object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, @@ -3291,8 +3321,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_offset; } - hdr = (struct binder_object_header *) - (t->buffer->data + object_offset); + hdr = &object.hdr; off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: @@ -3307,6 +3336,9 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { @@ -3320,6 +3352,9 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_translate_failed; } + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FD: { @@ -3335,6 +3370,9 @@ static void binder_transaction(struct binder_proc *proc, } fp->pad_binder = 0; fp->fd = target_fd; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + fp, sizeof(*fp)); } break; case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda = @@ -3419,7 +3457,10 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_translate_failed; } - last_fixup_obj = bp; + binder_alloc_copy_to_buffer(&target_proc->alloc, + t->buffer, object_offset, + bp, sizeof(*bp)); + last_fixup_obj = t->buffer->data + object_offset; last_fixup_min_off = 0; } break; default: From 5f46f335bd4da179c0e247a377d41ab62da769cb Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:17 -0800 Subject: [PATCH 061/134] UPSTREAM: binder: avoid kernel vm_area for buffer fixups Refactor the functions to validate and fixup struct binder_buffer pointer objects to avoid using vm_area pointers. Instead copy to/from kernel space using binder_alloc_copy_to_buffer() and binder_alloc_copy_from_buffer(). The following functions were refactored: refactor binder_validate_ptr() binder_validate_fixup() binder_fixup_parent() (cherry pick from commit db6b0b810bf945d1991917ffce0e93383101f2fa) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: Ic222af9b6c56bf48fd0b65debe981d19a7809e77 --- drivers/android/binder.c | 146 ++++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 49 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 58b2420ae45a..602f507d2248 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2274,10 +2274,13 @@ static size_t binder_get_object(struct binder_proc *proc, /** * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. + * @proc: binder_proc owning the buffer * @b: binder_buffer containing the object + * @object: struct binder_object to read into * @index: index in offset array at which the binder_buffer_object is * located - * @start: points to the start of the offset array + * @start_offset: points to the start of the offset array + * @object_offsetp: offset of @object read from @b * @num_valid: the number of valid offsets in the offset array * * Return: If @index is within the valid range of the offset array @@ -2288,34 +2291,46 @@ static size_t binder_get_object(struct binder_proc *proc, * Note that the offset found in index @index itself is not * verified; this function assumes that @num_valid elements * from @start were previously verified to have valid offsets. + * If @object_offsetp is non-NULL, then the offset within + * @b is written to it. */ -static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, - binder_size_t index, - binder_size_t *start, - binder_size_t num_valid) +static struct binder_buffer_object *binder_validate_ptr( + struct binder_proc *proc, + struct binder_buffer *b, + struct binder_object *object, + binder_size_t index, + binder_size_t start_offset, + binder_size_t *object_offsetp, + binder_size_t num_valid) { - struct binder_buffer_object *buffer_obj; - binder_size_t *offp; + size_t object_size; + binder_size_t object_offset; + unsigned long buffer_offset; if (index >= num_valid) return NULL; - offp = start + index; - buffer_obj = (struct binder_buffer_object *)(b->data + *offp); - if (buffer_obj->hdr.type != BINDER_TYPE_PTR) + buffer_offset = start_offset + sizeof(binder_size_t) * index; + binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, + b, buffer_offset, sizeof(object_offset)); + object_size = binder_get_object(proc, b, object_offset, object); + if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; + if (object_offsetp) + *object_offsetp = object_offset; - return buffer_obj; + return &object->bbo; } /** * binder_validate_fixup() - validates pointer/fd fixups happen in order. + * @proc: binder_proc owning the buffer * @b: transaction buffer - * @objects_start start of objects buffer - * @buffer: binder_buffer_object in which to fix up - * @offset: start offset in @buffer to fix up - * @last_obj: last binder_buffer_object that we fixed up in - * @last_min_offset: minimum fixup offset in @last_obj + * @objects_start_offset: offset to start of objects buffer + * @buffer_obj_offset: offset to binder_buffer_object in which to fix up + * @fixup_offset: start offset in @buffer to fix up + * @last_obj_offset: offset to last binder_buffer_object that we fixed + * @last_min_offset: minimum fixup offset in object at @last_obj_offset * * Return: %true if a fixup in buffer @buffer at offset @offset is * allowed. @@ -2346,28 +2361,41 @@ static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, * C (parent = A, offset = 16) * D (parent = B, offset = 0) // B is not A or any of A's parents */ -static bool binder_validate_fixup(struct binder_buffer *b, - binder_size_t *objects_start, - struct binder_buffer_object *buffer, +static bool binder_validate_fixup(struct binder_proc *proc, + struct binder_buffer *b, + binder_size_t objects_start_offset, + binder_size_t buffer_obj_offset, binder_size_t fixup_offset, - struct binder_buffer_object *last_obj, + binder_size_t last_obj_offset, binder_size_t last_min_offset) { - if (!last_obj) { + if (!last_obj_offset) { /* Nothing to fix up in */ return false; } - while (last_obj != buffer) { + while (last_obj_offset != buffer_obj_offset) { + unsigned long buffer_offset; + struct binder_object last_object; + struct binder_buffer_object *last_bbo; + size_t object_size = binder_get_object(proc, b, last_obj_offset, + &last_object); + if (object_size != sizeof(*last_bbo)) + return false; + + last_bbo = &last_object.bbo; /* * Safe to retrieve the parent of last_obj, since it * was already previously verified by the driver. */ - if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) + if ((last_bbo->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) return false; - last_min_offset = last_obj->parent_offset + sizeof(uintptr_t); - last_obj = (struct binder_buffer_object *) - (b->data + *(objects_start + last_obj->parent)); + last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t); + buffer_offset = objects_start_offset + + sizeof(binder_size_t) * last_bbo->parent, + binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, + b, buffer_offset, + sizeof(last_obj_offset)); } return (fixup_offset >= last_min_offset); } @@ -2378,6 +2406,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, { binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; + binder_size_t off_start_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %pK\n", @@ -2387,8 +2416,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - off_start = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); + off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); + off_start = (binder_size_t *)(buffer->data + off_start_offset); if (failed_at) off_end = failed_at; else @@ -2469,14 +2498,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda; struct binder_buffer_object *parent; + struct binder_object ptr_object; uintptr_t parent_buffer; u32 *fd_array; size_t fd_index; binder_size_t fd_buf_size; fda = to_binder_fd_array_object(hdr); - parent = binder_validate_ptr(buffer, fda->parent, - off_start, + parent = binder_validate_ptr(proc, buffer, &ptr_object, + fda->parent, + off_start_offset, + NULL, offp - off_start); if (!parent) { pr_err("transaction release %d bad parent offset", @@ -2788,9 +2820,9 @@ err_translate_fd_failed: static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, - binder_size_t *off_start, + binder_size_t off_start_offset, binder_size_t num_valid, - struct binder_buffer_object *last_fixup_obj, + binder_size_t last_fixup_obj_off, binder_size_t last_fixup_min_off) { struct binder_buffer_object *parent; @@ -2798,20 +2830,25 @@ static int binder_fixup_parent(struct binder_transaction *t, struct binder_buffer *b = t->buffer; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; + struct binder_object object; + binder_size_t buffer_offset; + binder_size_t parent_offset; if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) return 0; - parent = binder_validate_ptr(b, bp->parent, off_start, num_valid); + parent = binder_validate_ptr(target_proc, b, &object, bp->parent, + off_start_offset, &parent_offset, + num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return -EINVAL; } - if (!binder_validate_fixup(b, off_start, - parent, bp->parent_offset, - last_fixup_obj, + if (!binder_validate_fixup(target_proc, b, off_start_offset, + parent_offset, bp->parent_offset, + last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); @@ -2828,7 +2865,10 @@ static int binder_fixup_parent(struct binder_transaction *t, parent_buffer = (u8 *)((uintptr_t)parent->buffer - binder_alloc_get_user_buffer_offset( &target_proc->alloc)); - *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; + buffer_offset = bp->parent_offset + + (uintptr_t)parent_buffer - (uintptr_t)b->data; + binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset, + &bp->buffer, sizeof(bp->buffer)); return 0; } @@ -2954,6 +2994,7 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *t; struct binder_work *tcomplete; binder_size_t *offp, *off_end, *off_start; + binder_size_t off_start_offset; binder_size_t off_min; u8 *sg_bufp, *sg_buf_end; struct binder_proc *target_proc = NULL; @@ -2964,7 +3005,7 @@ static void binder_transaction(struct binder_proc *proc, uint32_t return_error = 0; uint32_t return_error_param = 0; uint32_t return_error_line = 0; - struct binder_buffer_object *last_fixup_obj = NULL; + binder_size_t last_fixup_obj_off = 0; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); @@ -3242,8 +3283,8 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->transaction = t; t->buffer->target_node = target_node; trace_binder_transaction_alloc_buf(t->buffer); - off_start = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); + off_start_offset = ALIGN(tr->data_size, sizeof(void *)); + off_start = (binder_size_t *)(t->buffer->data + off_start_offset); offp = off_start; if (binder_alloc_copy_user_to_buffer( @@ -3375,11 +3416,15 @@ static void binder_transaction(struct binder_proc *proc, fp, sizeof(*fp)); } break; case BINDER_TYPE_FDA: { + struct binder_object ptr_object; + binder_size_t parent_offset; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); struct binder_buffer_object *parent = - binder_validate_ptr(t->buffer, fda->parent, - off_start, + binder_validate_ptr(target_proc, t->buffer, + &ptr_object, fda->parent, + off_start_offset, + &parent_offset, offp - off_start); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", @@ -3389,9 +3434,11 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_parent; } - if (!binder_validate_fixup(t->buffer, off_start, - parent, fda->parent_offset, - last_fixup_obj, + if (!binder_validate_fixup(target_proc, t->buffer, + off_start_offset, + parent_offset, + fda->parent_offset, + last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); @@ -3408,7 +3455,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_translate_failed; } - last_fixup_obj = parent; + last_fixup_obj_off = parent_offset; last_fixup_min_off = fda->parent_offset + sizeof(u32) * fda->num_fds; } break; @@ -3447,9 +3494,10 @@ static void binder_transaction(struct binder_proc *proc, &target_proc->alloc); sg_bufp += ALIGN(bp->length, sizeof(u64)); - ret = binder_fixup_parent(t, thread, bp, off_start, + ret = binder_fixup_parent(t, thread, bp, + off_start_offset, offp - off_start, - last_fixup_obj, + last_fixup_obj_off, last_fixup_min_off); if (ret < 0) { return_error = BR_FAILED_REPLY; @@ -3460,7 +3508,7 @@ static void binder_transaction(struct binder_proc *proc, binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, bp, sizeof(*bp)); - last_fixup_obj = t->buffer->data + object_offset; + last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; default: From cfae4296c076ce1bc60115ed3b542bd754e14dcc Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:18 -0800 Subject: [PATCH 062/134] UPSTREAM: binder: remove kernel vm_area for buffer space Remove the kernel's vm_area and the code that maps buffer pages into it. (cherry pick from commit 880211667b203dd32724f3be224c44c0400aa0a6) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: I2595bb8416c2bbfcf97ad3d7380ae94e29c209fb --- drivers/android/binder_alloc.c | 40 ++-------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index abd02fa49f3e..ebdeb03df682 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -262,16 +262,6 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, page->alloc = alloc; INIT_LIST_HEAD(&page->lru); - ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, - &page->page_ptr); - flush_cache_vmap((unsigned long)page_addr, - (unsigned long)page_addr + PAGE_SIZE); - if (ret != 1) { - pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", - alloc->pid, page_addr); - goto err_map_kernel_failed; - } user_page_addr = (uintptr_t)page_addr + alloc->user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); @@ -311,8 +301,6 @@ free_range: continue; err_vm_insert_page_failed: - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); -err_map_kernel_failed: __free_page(page->page_ptr); page->page_ptr = NULL; err_alloc_page_failed: @@ -687,7 +675,6 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, struct vm_area_struct *vma) { int ret; - struct vm_struct *area; const char *failure_string; struct binder_buffer *buffer; @@ -698,28 +685,10 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, goto err_already_mapped; } - area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC); - if (area == NULL) { - ret = -ENOMEM; - failure_string = "get_vm_area"; - goto err_get_vm_area_failed; - } - alloc->buffer = area->addr; - alloc->user_buffer_offset = - vma->vm_start - (uintptr_t)alloc->buffer; + alloc->buffer = (void *)vma->vm_start; + alloc->user_buffer_offset = 0; mutex_unlock(&binder_alloc_mmap_lock); -#ifdef CONFIG_CPU_CACHE_VIPT - if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR( - (vma->vm_start ^ (uint32_t)alloc->buffer))) { - pr_info("%s: %d %lx-%lx maps %pK bad alignment\n", - __func__, alloc->pid, vma->vm_start, - vma->vm_end, alloc->buffer); - vma->vm_start += PAGE_SIZE; - } - } -#endif alloc->pages = kzalloc(sizeof(alloc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); @@ -752,9 +721,7 @@ err_alloc_buf_struct_failed: alloc->pages = NULL; err_alloc_pages_failed: mutex_lock(&binder_alloc_mmap_lock); - vfree(alloc->buffer); alloc->buffer = NULL; -err_get_vm_area_failed: err_already_mapped: mutex_unlock(&binder_alloc_mmap_lock); pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, @@ -811,12 +778,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) "%s: %d: page %d at %pK %s\n", __func__, alloc->pid, i, page_addr, on_lru ? "on lru" : "active"); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); __free_page(alloc->pages[i].page_ptr); page_count++; } kfree(alloc->pages); - vfree(alloc->buffer); } mutex_unlock(&alloc->mutex); if (alloc->vma_vm_mm) @@ -977,7 +942,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_start(alloc, index); - unmap_kernel_range(page_addr, PAGE_SIZE); __free_page(page->page_ptr); page->page_ptr = NULL; From cd84b4e06966f3cf0b7ffd896c28c283082d9cc1 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:19 -0800 Subject: [PATCH 063/134] BACKPORT: binder: remove user_buffer_offset Remove user_buffer_offset since there is no kernel buffer pointer anymore. (cherry pick from commit c41358a5f5217abd7c051e8d42397e5b80f3b3ed) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: I399219867704dc5013453a7738193c742fc970ad --- drivers/android/binder.c | 39 ++++++---------------------------- drivers/android/binder_alloc.c | 16 ++++++-------- drivers/android/binder_alloc.h | 23 -------------------- 3 files changed, 13 insertions(+), 65 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 602f507d2248..d9f4d330fed9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2499,7 +2499,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; - uintptr_t parent_buffer; u32 *fd_array; size_t fd_index; binder_size_t fd_buf_size; @@ -2515,14 +2514,6 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id); continue; } - /* - * Since the parent was already fixed up, convert it - * back to kernel address space to access it - */ - parent_buffer = parent->buffer - - binder_alloc_get_user_buffer_offset( - &proc->alloc); - fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { pr_err("transaction release %d invalid number of fds (%lld)\n", @@ -2536,7 +2527,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id, (u64)fda->num_fds); continue; } - fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); + fd_array = (u32 *)(uintptr_t) + (parent->buffer + fda->parent_offset); for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; @@ -2748,7 +2740,6 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, { binder_size_t fdi, fd_buf_size, num_installed_fds; int target_fd; - uintptr_t parent_buffer; u32 *fd_array; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; @@ -2766,13 +2757,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } - /* - * Since the parent was already fixed up, convert it - * back to the kernel address space to access it - */ - parent_buffer = parent->buffer - - binder_alloc_get_user_buffer_offset(&target_proc->alloc); - fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); + fd_array = (u32 *)(uintptr_t)(parent->buffer + fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); @@ -2826,7 +2811,6 @@ static int binder_fixup_parent(struct binder_transaction *t, binder_size_t last_fixup_min_off) { struct binder_buffer_object *parent; - u8 *parent_buffer; struct binder_buffer *b = t->buffer; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; @@ -2862,11 +2846,8 @@ static int binder_fixup_parent(struct binder_transaction *t, proc->pid, thread->pid); return -EINVAL; } - parent_buffer = (u8 *)((uintptr_t)parent->buffer - - binder_alloc_get_user_buffer_offset( - &target_proc->alloc)); buffer_offset = bp->parent_offset + - (uintptr_t)parent_buffer - (uintptr_t)b->data; + (uintptr_t)parent->buffer - (uintptr_t)b->data; binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset, &bp->buffer, sizeof(bp->buffer)); @@ -3269,10 +3250,8 @@ static void binder_transaction(struct binder_proc *proc, ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); - char *kptr = t->buffer->data + buf_offset; - t->security_ctx = (uintptr_t)kptr + - binder_alloc_get_user_buffer_offset(&target_proc->alloc); + t->security_ctx = (uintptr_t)t->buffer->data + buf_offset; binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, secctx, secctx_sz); @@ -3489,9 +3468,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ - bp->buffer = (uintptr_t)sg_bufp + - binder_alloc_get_user_buffer_offset( - &target_proc->alloc); + bp->buffer = (uintptr_t)sg_bufp; sg_bufp += ALIGN(bp->length, sizeof(u64)); ret = binder_fixup_parent(t, thread, bp, @@ -4475,9 +4452,7 @@ retry: trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; - trd->data.ptr.buffer = (binder_uintptr_t) - ((uintptr_t)t->buffer->data + - binder_alloc_get_user_buffer_offset(&proc->alloc)); + trd->data.ptr.buffer = (uintptr_t)t->buffer->data; trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index ebdeb03df682..d71588de38e4 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -136,17 +136,17 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; - void *kern_ptr; + void *uptr; - kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset); + uptr = (void *)user_ptr; while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (kern_ptr < buffer->data) + if (uptr < buffer->data) n = n->rb_left; - else if (kern_ptr > buffer->data) + else if (uptr > buffer->data) n = n->rb_right; else { /* @@ -262,8 +262,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, page->alloc = alloc; INIT_LIST_HEAD(&page->lru); - user_page_addr = - (uintptr_t)page_addr + alloc->user_buffer_offset; + user_page_addr = (uintptr_t)page_addr; ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", @@ -686,7 +685,6 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, } alloc->buffer = (void *)vma->vm_start; - alloc->user_buffer_offset = 0; mutex_unlock(&binder_alloc_mmap_lock); alloc->pages = kzalloc(sizeof(alloc->pages[0]) * @@ -930,9 +928,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(vma, - page_addr + alloc->user_buffer_offset, - PAGE_SIZE); + zap_page_range(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a9bec2e923f7..a6b158e55e46 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -82,7 +82,6 @@ struct binder_lru_page { * (invariant after init) * @vma_vm_mm: copy of vma->vm_mm (invarient after mmap) * @buffer: base of per-proc address space mapped via mmap - * @user_buffer_offset: offset between user and kernel VAs for buffer * @buffers: list of all buffers for this proc * @free_buffers: rb tree of buffers available for allocation * sorted by size @@ -104,7 +103,6 @@ struct binder_alloc { struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; - ptrdiff_t user_buffer_offset; struct list_head buffers; struct rb_root free_buffers; struct rb_root allocated_buffers; @@ -163,27 +161,6 @@ binder_alloc_get_free_async_space(struct binder_alloc *alloc) return free_async_space; } -/** - * binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs - * @alloc: binder_alloc for this proc - * - * Return: the offset between kernel and user-space addresses to use for - * virtual address conversion - */ -static inline ptrdiff_t -binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) -{ - /* - * user_buffer_offset is constant if vma is set and - * undefined if vma is not set. It is possible to - * get here with !alloc->vma if the target process - * is dying while a transaction is being initiated. - * Returning the old value is ok in this case and - * the transaction will fail. - */ - return alloc->user_buffer_offset; -} - unsigned long binder_alloc_copy_user_to_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer, From 5e696a32855ac6d7c49192ff337ba7b873fabede Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 5 Dec 2018 15:19:25 -0800 Subject: [PATCH 064/134] UPSTREAM: binder: fix kerneldoc header for struct binder_buffer Fix the incomplete kerneldoc header for struct binder_buffer. (cherry pick from commit 7a2670a5bc917e4e7c9be5274efc004f9bd1216a) Bug: 67668716 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: I6bb942e6a9466b02653349943524462f205af839 --- drivers/android/binder_alloc.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a6b158e55e46..1026e9fb20db 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -31,16 +31,16 @@ struct binder_transaction; * struct binder_buffer - buffer used for binder transactions * @entry: entry alloc->buffers * @rb_node: node for allocated_buffers/free_buffers rb trees - * @free: true if buffer is free - * @allow_user_free: describe the second member of struct blah, - * @async_transaction: describe the second member of struct blah, - * @debug_id: describe the second member of struct blah, - * @transaction: describe the second member of struct blah, - * @target_node: describe the second member of struct blah, - * @data_size: describe the second member of struct blah, - * @offsets_size: describe the second member of struct blah, - * @extra_buffers_size: describe the second member of struct blah, - * @data:i describe the second member of struct blah, + * @free: %true if buffer is free + * @allow_user_free: %true if user is allowed to free buffer + * @async_transaction: %true if buffer is in use for an async txn + * @debug_id: unique ID for debugging + * @transaction: pointer to associated struct binder_transaction + * @target_node: struct binder_node associated with this buffer + * @data_size: size of @transaction data + * @offsets_size: size of array of offsets + * @extra_buffers_size: size of space for other objects (like sg lists) + * @data: pointer to base of buffer space * * Bookkeeping structure for binder transaction buffers */ From 9fcfd1ec697b3c49fa08adeda15136d1de112b33 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 8 Feb 2019 10:35:20 -0800 Subject: [PATCH 065/134] BACKPORT: binder: use userspace pointer as base of buffer space Now that alloc->buffer points to the userspace vm_area rename buffer->data to buffer->user_data and rename local pointers that hold user addresses. Also use the "__user" tag to annotate all user pointers so sparse can flag cases where user pointer vaues are copied to kernel pointers. Refactor code to use offsets instead of user pointers. (cherry pick from commit bde4a19fc04f5f46298c86b1acb7a4af1d5f138d) Bug: 67668716 Change-Id: I9d04b844c5994d1f6214da795799e6b373bc9816 Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 123 +++++++++++++----------- drivers/android/binder_alloc.c | 87 +++++++++-------- drivers/android/binder_alloc.h | 6 +- drivers/android/binder_alloc_selftest.c | 4 +- drivers/android/binder_trace.h | 2 +- 5 files changed, 119 insertions(+), 103 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d9f4d330fed9..445f1d4586ec 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2402,33 +2402,30 @@ static bool binder_validate_fixup(struct binder_proc *proc, static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, - binder_size_t *failed_at) + binder_size_t failed_at, + bool is_failure) { - binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; - binder_size_t off_start_offset; + binder_size_t off_start_offset, buffer_offset, off_end_offset; binder_debug(BINDER_DEBUG_TRANSACTION, - "%d buffer release %d, size %zd-%zd, failed at %pK\n", + "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, - buffer->data_size, buffer->offsets_size, failed_at); + buffer->data_size, buffer->offsets_size, + (unsigned long long)failed_at); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_start = (binder_size_t *)(buffer->data + off_start_offset); - if (failed_at) - off_end = failed_at; - else - off_end = (void *)off_start + buffer->offsets_size; - for (offp = off_start; offp < off_end; offp++) { + off_end_offset = is_failure ? failed_at : + off_start_offset + buffer->offsets_size; + for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; + buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; - binder_size_t buffer_offset = (uintptr_t)offp - - (uintptr_t)buffer->data; binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, @@ -2499,16 +2496,19 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; - u32 *fd_array; + binder_size_t fda_offset; size_t fd_index; binder_size_t fd_buf_size; + binder_size_t num_valid; + num_valid = (buffer_offset - off_start_offset) / + sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); parent = binder_validate_ptr(proc, buffer, &ptr_object, fda->parent, off_start_offset, NULL, - offp - off_start); + num_valid); if (!parent) { pr_err("transaction release %d bad parent offset", debug_id); @@ -2527,14 +2527,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id, (u64)fda->num_fds); continue; } - fd_array = (u32 *)(uintptr_t) - (parent->buffer + fda->parent_offset); + /* + * the source data for binder_buffer_object is visible + * to user-space and the @buffer element is the user + * pointer to the buffer_object containing the fd_array. + * Convert the address to an offset relative to + * the base of the transaction buffer. + */ + fda_offset = + (parent->buffer - (uintptr_t)buffer->user_data) + + fda->parent_offset; for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; - binder_size_t offset = - (uintptr_t)&fd_array[fd_index] - - (uintptr_t)buffer->data; + binder_size_t offset = fda_offset + + fd_index * sizeof(fd); binder_alloc_copy_from_buffer(&proc->alloc, &fd, @@ -2739,8 +2746,8 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, struct binder_transaction *in_reply_to) { binder_size_t fdi, fd_buf_size, num_installed_fds; + binder_size_t fda_offset; int target_fd; - u32 *fd_array; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; @@ -2757,8 +2764,16 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } - fd_array = (u32 *)(uintptr_t)(parent->buffer + fda->parent_offset); - if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { + /* + * the source data for binder_buffer_object is visible + * to user-space and the @buffer element is the user + * pointer to the buffer_object containing the fd_array. + * Convert the address to an offset relative to + * the base of the transaction buffer. + */ + fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) + + fda->parent_offset; + if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; @@ -2766,9 +2781,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; int target_fd; - binder_size_t offset = - (uintptr_t)&fd_array[fdi] - - (uintptr_t)t->buffer->data; + binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_alloc_copy_from_buffer(&target_proc->alloc, &fd, t->buffer, @@ -2790,10 +2803,7 @@ err_translate_fd_failed: num_installed_fds = fdi; for (fdi = 0; fdi < num_installed_fds; fdi++) { u32 fd; - binder_size_t offset = - (uintptr_t)&fd_array[fdi] - - (uintptr_t)t->buffer->data; - + binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_alloc_copy_from_buffer(&target_proc->alloc, &fd, t->buffer, offset, sizeof(fd)); @@ -2847,7 +2857,7 @@ static int binder_fixup_parent(struct binder_transaction *t, return -EINVAL; } buffer_offset = bp->parent_offset + - (uintptr_t)parent->buffer - (uintptr_t)b->data; + (uintptr_t)parent->buffer - (uintptr_t)b->user_data; binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset, &bp->buffer, sizeof(bp->buffer)); @@ -2974,10 +2984,10 @@ static void binder_transaction(struct binder_proc *proc, int ret; struct binder_transaction *t; struct binder_work *tcomplete; - binder_size_t *offp, *off_end, *off_start; - binder_size_t off_start_offset; + binder_size_t buffer_offset = 0; + binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; - u8 *sg_bufp, *sg_buf_end; + binder_size_t sg_buf_offset, sg_buf_end_offset; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -3251,7 +3261,7 @@ static void binder_transaction(struct binder_proc *proc, ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); - t->security_ctx = (uintptr_t)t->buffer->data + buf_offset; + t->security_ctx = (uintptr_t)t->buffer->user_data + buf_offset; binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, secctx, secctx_sz); @@ -3262,9 +3272,6 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->transaction = t; t->buffer->target_node = target_node; trace_binder_transaction_alloc_buf(t->buffer); - off_start_offset = ALIGN(tr->data_size, sizeof(void *)); - off_start = (binder_size_t *)(t->buffer->data + off_start_offset); - offp = off_start; if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, @@ -3310,17 +3317,18 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_bad_offset; } - off_end = (void *)off_start + tr->offsets_size; - sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *))); - sg_buf_end = sg_bufp + extra_buffers_size; + off_start_offset = ALIGN(tr->data_size, sizeof(void *)); + buffer_offset = off_start_offset; + off_end_offset = off_start_offset + tr->offsets_size; + sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); + sg_buf_end_offset = sg_buf_offset + extra_buffers_size; off_min = 0; - for (; offp < off_end; offp++) { + for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; + buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; - binder_size_t buffer_offset = - (uintptr_t)offp - (uintptr_t)t->buffer->data; binder_alloc_copy_from_buffer(&target_proc->alloc, &object_offset, @@ -3399,12 +3407,14 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t parent_offset; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); + size_t num_valid = (buffer_offset - off_start_offset) * + sizeof(binder_size_t); struct binder_buffer_object *parent = binder_validate_ptr(target_proc, t->buffer, &ptr_object, fda->parent, off_start_offset, &parent_offset, - offp - off_start); + num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); @@ -3441,9 +3451,8 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); - size_t buf_left = sg_buf_end - sg_bufp; - binder_size_t sg_buf_offset = (uintptr_t)sg_bufp - - (uintptr_t)t->buffer->data; + size_t buf_left = sg_buf_end_offset - sg_buf_offset; + size_t num_valid; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", @@ -3468,12 +3477,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ - bp->buffer = (uintptr_t)sg_bufp; - sg_bufp += ALIGN(bp->length, sizeof(u64)); + bp->buffer = (uintptr_t) + t->buffer->user_data + sg_buf_offset; + sg_buf_offset += ALIGN(bp->length, sizeof(u64)); + num_valid = (buffer_offset - off_start_offset) * + sizeof(binder_size_t); ret = binder_fixup_parent(t, thread, bp, off_start_offset, - offp - off_start, + num_valid, last_fixup_obj_off, last_fixup_min_off); if (ret < 0) { @@ -3565,7 +3577,8 @@ err_bad_offset: err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, t->buffer, offp); + binder_transaction_buffer_release(target_proc, t->buffer, + buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); target_node = NULL; @@ -3844,7 +3857,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, buffer, NULL); + binder_transaction_buffer_release(proc, buffer, 0, false); binder_alloc_free_buf(&proc->alloc, buffer); break; } @@ -4452,7 +4465,7 @@ retry: trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; - trd->data.ptr.buffer = (uintptr_t)t->buffer->data; + trd->data.ptr.buffer = (uintptr_t)t->buffer->user_data; trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); @@ -5511,7 +5524,7 @@ static void print_binder_transaction_ilocked(struct seq_file *m, seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd data %pK\n", buffer->data_size, buffer->offsets_size, - buffer->data); + buffer->user_data); } static void print_binder_work_ilocked(struct seq_file *m, diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index d71588de38e4..648d8f8e3d68 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -67,9 +67,8 @@ static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) - return (u8 *)alloc->buffer + - alloc->buffer_size - (u8 *)buffer->data; - return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data; + return alloc->buffer + alloc->buffer_size - buffer->user_data; + return binder_buffer_next(buffer)->user_data - buffer->user_data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -119,9 +118,9 @@ static void binder_insert_allocated_buffer_locked( buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (new_buffer->data < buffer->data) + if (new_buffer->user_data < buffer->user_data) p = &parent->rb_left; - else if (new_buffer->data > buffer->data) + else if (new_buffer->user_data > buffer->user_data) p = &parent->rb_right; else BUG(); @@ -136,17 +135,17 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; - void *uptr; + void __user *uptr; - uptr = (void *)user_ptr; + uptr = (void __user *)user_ptr; while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (uptr < buffer->data) + if (uptr < buffer->user_data) n = n->rb_left; - else if (uptr > buffer->data) + else if (uptr > buffer->user_data) n = n->rb_right; else { /* @@ -186,9 +185,9 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end) + void __user *start, void __user *end) { - void *page_addr; + void __user *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; struct vm_area_struct *vma = NULL; @@ -353,8 +352,8 @@ static struct binder_buffer *binder_alloc_new_buf_locked( struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; - void *has_page_addr; - void *end_page_addr; + void __user *has_page_addr; + void __user *end_page_addr; size_t size, data_offsets_size; int ret; @@ -448,15 +447,15 @@ static struct binder_buffer *binder_alloc_new_buf_locked( "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", alloc->pid, size, buffer, buffer_size); - has_page_addr = - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); + has_page_addr = (void __user *) + (((uintptr_t)buffer->user_data + buffer_size) & PAGE_MASK); WARN_ON(n && buffer_size != size); end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); + (void __user *)PAGE_ALIGN((uintptr_t)buffer->user_data + size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; - ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); + ret = binder_update_page_range(alloc, 1, (void __user *) + PAGE_ALIGN((uintptr_t)buffer->user_data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -469,7 +468,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( __func__, alloc->pid); goto err_alloc_buf_struct_failed; } - new_buffer->data = (u8 *)buffer->data + size; + new_buffer->user_data = (u8 __user *)buffer->user_data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; binder_insert_free_buffer(alloc, new_buffer); @@ -495,8 +494,8 @@ static struct binder_buffer *binder_alloc_new_buf_locked( return buffer; err_alloc_buf_struct_failed: - binder_update_page_range(alloc, 0, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), + binder_update_page_range(alloc, 0, (void __user *) + PAGE_ALIGN((uintptr_t)buffer->user_data), end_page_addr); return ERR_PTR(-ENOMEM); } @@ -531,14 +530,15 @@ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, return buffer; } -static void *buffer_start_page(struct binder_buffer *buffer) +static void __user *buffer_start_page(struct binder_buffer *buffer) { - return (void *)((uintptr_t)buffer->data & PAGE_MASK); + return (void __user *)((uintptr_t)buffer->user_data & PAGE_MASK); } -static void *prev_buffer_end_page(struct binder_buffer *buffer) +static void __user *prev_buffer_end_page(struct binder_buffer *buffer) { - return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK); + return (void __user *) + (((uintptr_t)(buffer->user_data) - 1) & PAGE_MASK); } static void binder_delete_free_buffer(struct binder_alloc *alloc, @@ -553,7 +553,8 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer->data, prev->data); + alloc->pid, buffer->user_data, + prev->user_data); } if (!list_is_last(&buffer->entry, &alloc->buffers)) { @@ -563,23 +564,24 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK share page with %pK\n", alloc->pid, - buffer->data, - next->data); + buffer->user_data, + next->user_data); } } - if (PAGE_ALIGNED(buffer->data)) { + if (PAGE_ALIGNED(buffer->user_data)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer start %pK is page aligned\n", - alloc->pid, buffer->data); + alloc->pid, buffer->user_data); to_free = false; } if (to_free) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", - alloc->pid, buffer->data, - prev->data, next ? next->data : NULL); + alloc->pid, buffer->user_data, + prev->user_data, + next ? next->user_data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE); } @@ -605,8 +607,8 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON(buffer->data < alloc->buffer); - BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size); + BUG_ON(buffer->user_data < alloc->buffer); + BUG_ON(buffer->user_data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { alloc->free_async_space += size + sizeof(struct binder_buffer); @@ -617,8 +619,9 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, } binder_update_page_range(alloc, 0, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); + (void __user *)PAGE_ALIGN((uintptr_t)buffer->user_data), + (void __user *)(((uintptr_t) + buffer->user_data + buffer_size) & PAGE_MASK)); rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; @@ -684,7 +687,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, goto err_already_mapped; } - alloc->buffer = (void *)vma->vm_start; + alloc->buffer = (void __user *)vma->vm_start; mutex_unlock(&binder_alloc_mmap_lock); alloc->pages = kzalloc(sizeof(alloc->pages[0]) * @@ -704,7 +707,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, goto err_alloc_buf_struct_failed; } - buffer->data = alloc->buffer; + buffer->user_data = alloc->buffer; list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; binder_insert_free_buffer(alloc, buffer); @@ -763,7 +766,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) int i; for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - void *page_addr; + void __user *page_addr; bool on_lru; if (!alloc->pages[i].page_ptr) @@ -794,7 +797,7 @@ static void print_binder_buffer(struct seq_file *m, const char *prefix, struct binder_buffer *buffer) { seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, + prefix, buffer->debug_id, buffer->user_data, buffer->data_size, buffer->offsets_size, buffer->extra_buffers_size, buffer->transaction ? "active" : "delivered"); @@ -1045,7 +1048,7 @@ static inline bool check_buffer(struct binder_alloc *alloc, * @pgoffp: address to copy final page offset to * * Lookup the struct page corresponding to the address - * at @buffer_offset into @buffer->data. If @pgoffp is not + * at @buffer_offset into @buffer->user_data. If @pgoffp is not * NULL, the byte-offset into the page is written there. * * The caller is responsible to ensure that the offset points @@ -1062,7 +1065,7 @@ static struct page *binder_alloc_get_page(struct binder_alloc *alloc, pgoff_t *pgoffp) { binder_size_t buffer_space_offset = buffer_offset + - (buffer->data - alloc->buffer); + (buffer->user_data - alloc->buffer); pgoff_t pgoff = buffer_space_offset & ~PAGE_MASK; size_t index = buffer_space_offset >> PAGE_SHIFT; struct binder_lru_page *lru_page; diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 1026e9fb20db..b60d161b7a7a 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -40,7 +40,7 @@ struct binder_transaction; * @data_size: size of @transaction data * @offsets_size: size of array of offsets * @extra_buffers_size: size of space for other objects (like sg lists) - * @data: pointer to base of buffer space + * @user_data: user pointer to base of buffer space * * Bookkeeping structure for binder transaction buffers */ @@ -59,7 +59,7 @@ struct binder_buffer { size_t data_size; size_t offsets_size; size_t extra_buffers_size; - void *data; + void __user *user_data; }; /** @@ -102,7 +102,7 @@ struct binder_alloc { struct mutex mutex; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; - void *buffer; + void __user *buffer; struct list_head buffers; struct rb_root free_buffers; struct rb_root allocated_buffers; diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index 8bd7bcef967d..f0f4d7d02635 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -105,8 +105,8 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, void *page_addr, *end; int page_index; - end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); - page_addr = buffer->data; + end = (void *)PAGE_ALIGN((uintptr_t)buffer->user_data + size); + page_addr = buffer->user_data; for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; if (!alloc->pages[page_index].page_ptr || diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index b11dffc521e8..7674231af8cb 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -296,7 +296,7 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TRACE_EVENT(binder_update_page_range, TP_PROTO(struct binder_alloc *alloc, bool allocate, - void *start, void *end), + void __user *start, void __user *end), TP_ARGS(alloc, allocate, start, end), TP_STRUCT__entry( __field(int, proc) From 91bfaa07a2897825be8b5c2004abac596767f5d8 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Feb 2019 11:48:53 -0800 Subject: [PATCH 066/134] UPSTREAM: binder: fix sparse issue in binder_alloc_selftest.c Fixes sparse issues reported by the kbuild test robot running on https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc-testing: bde4a19fc04f5 ("binder: use userspace pointer as base of buffer space") Error output (drivers/android/binder_alloc_selftest.c): sparse: warning: incorrect type in assignment (different address spaces) sparse: expected void *page_addr sparse: got void [noderef] *user_data sparse: error: subtraction of different types can't work Fixed by adding necessary "__user" tags. (cherry pick from commit 36f30937922ce75390c73f99e650e4f2eb56b0e6) Bug: 67668716 Reported-by: kbuild test robot Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: Ia0a16d163251381d4bc04f46a44dddbc18b10a85 --- drivers/android/binder_alloc_selftest.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index f0f4d7d02635..b72708918b06 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -102,10 +102,11 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, struct binder_buffer *buffer, size_t size) { - void *page_addr, *end; + void __user *page_addr; + void __user *end; int page_index; - end = (void *)PAGE_ALIGN((uintptr_t)buffer->user_data + size); + end = (void __user *)PAGE_ALIGN((uintptr_t)buffer->user_data + size); page_addr = buffer->user_data; for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; From d8382216065b71035f7f8e274c6cdbe7c9f828e1 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 14 Feb 2019 15:22:57 -0800 Subject: [PATCH 067/134] UPSTREAM: binder: fix handling of misaligned binder object Fixes crash found by syzbot: kernel BUG at drivers/android/binder_alloc.c:LINE! (2) (cherry pick from commit 26528be6720bb40bc8844e97ee73a37e530e9c5e) Bug: 67668716 Reported-and-tested-by: syzbot+55de1eb4975dec156d8f@syzkaller.appspotmail.com Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman Change-Id: Ib8597dd05a158f78503d4affe6c5f46ded16a811 --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 445f1d4586ec..94e2872cff03 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2239,7 +2239,7 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (read_size < sizeof(*hdr)) + if (read_size < sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) return 0; binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, offset, read_size); From 7df8ba42237b91392cf7d0a5d9a59caa400b5fe8 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Thu, 21 Mar 2019 13:23:03 -0700 Subject: [PATCH 068/134] ANDROID: Add dm-bow to cuttlefish configuration Change-Id: I7f265cb8c6274da414d2477da9953546510ce26b Signed-off-by: Paul Lawrence --- arch/arm64/configs/cuttlefish_defconfig | 1 + arch/x86/configs/x86_64_cuttlefish_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/cuttlefish_defconfig b/arch/arm64/configs/cuttlefish_defconfig index e6c3dcad412e..856f1f4991ed 100644 --- a/arch/arm64/configs/cuttlefish_defconfig +++ b/arch/arm64/configs/cuttlefish_defconfig @@ -217,6 +217,7 @@ CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y CONFIG_DM_VERITY_AVB=y +CONFIG_DM_BOW=y CONFIG_NETDEVICES=y CONFIG_NETCONSOLE=y CONFIG_NETCONSOLE_DYNAMIC=y diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index ec2686d8abc2..9c542d0eb2a8 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -230,6 +230,7 @@ CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y CONFIG_DM_ANDROID_VERITY=y +CONFIG_DM_BOW=y CONFIG_NETDEVICES=y CONFIG_NETCONSOLE=y CONFIG_NETCONSOLE_DYNAMIC=y From db689dd813b721e893a0d32f1ee8a75f4d9efb3e Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Tue, 26 Mar 2019 09:05:55 -0700 Subject: [PATCH 069/134] ANDROID: dm-bow: Fix 32 bit compile errors See https://www.kernel.org/doc/html/v4.17/core-api/printk-formats.html Also 64-bit modulus not defined on 32-bit architectures Test: i386_defconfig and x86_64_cuttlefish_defconfig compile Change-Id: I57b9372e12e97b9a18232191b525e7601bc57a24 Signed-off-by: Paul Lawrence --- drivers/md/dm-bow.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-bow.c b/drivers/md/dm-bow.c index fa0675d58853..b92da30a3d42 100644 --- a/drivers/md/dm-bow.c +++ b/drivers/md/dm-bow.c @@ -267,7 +267,8 @@ static struct bow_range *find_free_range(struct bow_context *bc) static sector_t sector_to_page(struct bow_context const *bc, sector_t sector) { - WARN_ON(sector % (bc->block_size / SECTOR_SIZE) != 0); + WARN_ON((sector & (((sector_t)1 << (bc->block_shift - SECTOR_SHIFT)) - 1)) + != 0); return sector >> (bc->block_shift - SECTOR_SHIFT); } @@ -292,7 +293,8 @@ static int copy_data(struct bow_context const *bc, read = dm_bufio_read(bc->bufio, page, &read_buffer); if (IS_ERR(read)) { - DMERR("Cannot read page %lu", page); + DMERR("Cannot read page %llu", + (unsigned long long)page); return PTR_ERR(read); } @@ -953,8 +955,9 @@ static int add_trim(struct bow_context *bc, struct bio *bio) struct bow_range *br; struct bvec_iter bi_iter = bio->bi_iter; - DMDEBUG("add_trim: %lu, %u", - bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + DMDEBUG("add_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); do { br = find_first_overlapping_range(&bc->ranges, &bi_iter); @@ -991,8 +994,9 @@ static int remove_trim(struct bow_context *bc, struct bio *bio) struct bow_range *br; struct bvec_iter bi_iter = bio->bi_iter; - DMDEBUG("remove_trim: %lu, %u", - bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + DMDEBUG("remove_trim: %llu, %u", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); do { br = find_first_overlapping_range(&bc->ranges, &bi_iter); @@ -1117,8 +1121,9 @@ static void dm_bow_tablestatus(struct dm_target *ti, char *result, for (i = rb_first(&bc->ranges); i; i = rb_next(i)) { struct bow_range *br = container_of(i, struct bow_range, node); - result += scnprintf(result, end - result, "%s: %lu", - readable_type[br->type], br->sector); + result += scnprintf(result, end - result, "%s: %llu", + readable_type[br->type], + (unsigned long long)br->sector); if (result >= end) return; From 3ce190bbca8cbacf3796aacad2f2f441725799d7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Mar 2019 11:09:19 +0100 Subject: [PATCH 070/134] mmc: pxamci: fix enum type confusion commit e60a582bcde01158a64ff948fb799f21f5d31a11 upstream. clang points out several instances of mismatched types in this drivers, all coming from a single declaration: drivers/mmc/host/pxamci.c:193:15: error: implicit conversion from enumeration type 'enum dma_transfer_direction' to different enumeration type 'enum dma_data_direction' [-Werror,-Wenum-conversion] direction = DMA_DEV_TO_MEM; ~ ^~~~~~~~~~~~~~ drivers/mmc/host/pxamci.c:212:62: error: implicit conversion from enumeration type 'enum dma_data_direction' to different enumeration type 'enum dma_transfer_direction' [-Werror,-Wenum-conversion] tx = dmaengine_prep_slave_sg(chan, data->sg, host->dma_len, direction, The behavior is correct, so this must be a simply typo from dma_data_direction and dma_transfer_direction being similarly named types with a similar purpose. Fixes: 6464b7140951 ("mmc: pxamci: switch over to dmaengine use") Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Acked-by: Robert Jarzmik Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/pxamci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index c763b404510f..3e139692fe8f 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -181,7 +181,7 @@ static void pxamci_dma_irq(void *param); static void pxamci_setup_data(struct pxamci_host *host, struct mmc_data *data) { struct dma_async_tx_descriptor *tx; - enum dma_data_direction direction; + enum dma_transfer_direction direction; struct dma_slave_config config; struct dma_chan *chan; unsigned int nob = data->blocks; From 47248fde59a60a31a55d489f4851836f0ac94295 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 18 Mar 2019 15:47:58 +0100 Subject: [PATCH 071/134] drm/vmwgfx: Don't double-free the mode stored in par->set_mode commit c2d311553855395764e2e5bf401d987ba65c2056 upstream. When calling vmw_fb_set_par(), the mode stored in par->set_mode gets free'd twice. The first free is in vmw_fb_kms_detach(), the second is near the end of vmw_fb_set_par() under the name of 'old_mode'. The mode-setting code only works correctly if the mode doesn't actually change. Removing 'old_mode' in favor of using par->set_mode directly fixes the problem. Cc: Fixes: a278724aa23c ("drm/vmwgfx: Implement fbdev on kms v2") Signed-off-by: Thomas Zimmermann Reviewed-by: Deepak Rawat Signed-off-by: Thomas Hellstrom Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_fb.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c index d23a18aae476..3ba9b6ad0281 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c @@ -588,11 +588,9 @@ static int vmw_fb_set_par(struct fb_info *info) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }; - struct drm_display_mode *old_mode; struct drm_display_mode *mode; int ret; - old_mode = par->set_mode; mode = drm_mode_duplicate(vmw_priv->dev, &new_mode); if (!mode) { DRM_ERROR("Could not create new fb mode.\n"); @@ -603,11 +601,7 @@ static int vmw_fb_set_par(struct fb_info *info) mode->vdisplay = var->yres; vmw_guess_mode_timing(mode); - if (old_mode && drm_mode_equal(old_mode, mode)) { - drm_mode_destroy(vmw_priv->dev, mode); - mode = old_mode; - old_mode = NULL; - } else if (!vmw_kms_validate_mode_vram(vmw_priv, + if (!vmw_kms_validate_mode_vram(vmw_priv, mode->hdisplay * DIV_ROUND_UP(var->bits_per_pixel, 8), mode->vdisplay)) { @@ -677,8 +671,8 @@ static int vmw_fb_set_par(struct fb_info *info) schedule_delayed_work(&par->local_work, 0); out_unlock: - if (old_mode) - drm_mode_destroy(vmw_priv->dev, old_mode); + if (par->set_mode) + drm_mode_destroy(vmw_priv->dev, par->set_mode); par->set_mode = mode; drm_modeset_unlock_all(vmw_priv->dev); From 1e7b9a3143ad2a24c909351acecfdeafe17d0a9d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 13 Mar 2019 10:03:17 +0100 Subject: [PATCH 072/134] iommu/amd: fix sg->dma_address for sg->offset bigger than PAGE_SIZE commit 4e50ce03976fbc8ae995a000c4b10c737467beaa upstream. Take into account that sg->offset can be bigger than PAGE_SIZE when setting segment sg->dma_address. Otherwise sg->dma_address will point at diffrent page, what makes DMA not possible with erros like this: xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa70c0 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7040 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7080 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7100 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7000 flags=0x0020] Additinally with wrong sg->dma_address unmap_sg will free wrong pages, what what can cause crashes like this: Feb 28 19:27:45 kernel: BUG: Bad page state in process cinnamon pfn:39e8b1 Feb 28 19:27:45 kernel: Disabling lock debugging due to kernel taint Feb 28 19:27:45 kernel: flags: 0x2ffff0000000000() Feb 28 19:27:45 kernel: raw: 02ffff0000000000 0000000000000000 ffffffff00000301 0000000000000000 Feb 28 19:27:45 kernel: raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 Feb 28 19:27:45 kernel: page dumped because: nonzero _refcount Feb 28 19:27:45 kernel: Modules linked in: ccm fuse arc4 nct6775 hwmon_vid amdgpu nls_iso8859_1 nls_cp437 edac_mce_amd vfat fat kvm_amd ccp rng_core kvm mt76x0u mt76x0_common mt76x02_usb irqbypass mt76_usb mt76x02_lib mt76 crct10dif_pclmul crc32_pclmul chash mac80211 amd_iommu_v2 ghash_clmulni_intel gpu_sched i2c_algo_bit ttm wmi_bmof snd_hda_codec_realtek snd_hda_codec_generic drm_kms_helper snd_hda_codec_hdmi snd_hda_intel drm snd_hda_codec aesni_intel snd_hda_core snd_hwdep aes_x86_64 crypto_simd snd_pcm cfg80211 cryptd mousedev snd_timer glue_helper pcspkr r8169 input_leds realtek agpgart libphy rfkill snd syscopyarea sysfillrect sysimgblt fb_sys_fops soundcore sp5100_tco k10temp i2c_piix4 wmi evdev gpio_amdpt pinctrl_amd mac_hid pcc_cpufreq acpi_cpufreq sg ip_tables x_tables ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) fscrypto(E) sd_mod(E) hid_generic(E) usbhid(E) hid(E) dm_mod(E) serio_raw(E) atkbd(E) libps2(E) crc32c_intel(E) ahci(E) libahci(E) libata(E) xhci_pci(E) xhci_hcd(E) Feb 28 19:27:45 kernel: scsi_mod(E) i8042(E) serio(E) bcache(E) crc64(E) Feb 28 19:27:45 kernel: CPU: 2 PID: 896 Comm: cinnamon Tainted: G B W E 4.20.12-arch1-1-custom #1 Feb 28 19:27:45 kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./B450M Pro4, BIOS P1.20 06/26/2018 Feb 28 19:27:45 kernel: Call Trace: Feb 28 19:27:45 kernel: dump_stack+0x5c/0x80 Feb 28 19:27:45 kernel: bad_page.cold.29+0x7f/0xb2 Feb 28 19:27:45 kernel: __free_pages_ok+0x2c0/0x2d0 Feb 28 19:27:45 kernel: skb_release_data+0x96/0x180 Feb 28 19:27:45 kernel: __kfree_skb+0xe/0x20 Feb 28 19:27:45 kernel: tcp_recvmsg+0x894/0xc60 Feb 28 19:27:45 kernel: ? reuse_swap_page+0x120/0x340 Feb 28 19:27:45 kernel: ? ptep_set_access_flags+0x23/0x30 Feb 28 19:27:45 kernel: inet_recvmsg+0x5b/0x100 Feb 28 19:27:45 kernel: __sys_recvfrom+0xc3/0x180 Feb 28 19:27:45 kernel: ? handle_mm_fault+0x10a/0x250 Feb 28 19:27:45 kernel: ? syscall_trace_enter+0x1d3/0x2d0 Feb 28 19:27:45 kernel: ? __audit_syscall_exit+0x22a/0x290 Feb 28 19:27:45 kernel: __x64_sys_recvfrom+0x24/0x30 Feb 28 19:27:45 kernel: do_syscall_64+0x5b/0x170 Feb 28 19:27:45 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 Cc: stable@vger.kernel.org Reported-and-tested-by: Jan Viktorin Reviewed-by: Alexander Duyck Signed-off-by: Stanislaw Gruszka Fixes: 80187fd39dcb ('iommu/amd: Optimize map_sg and unmap_sg') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd_iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 78b97f31a1f2..bd339bfe0d15 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2548,7 +2548,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, /* Everything is mapped - write the right values into s->dma_address */ for_each_sg(sglist, s, nelems, i) { - s->dma_address += address + s->offset; + /* + * Add in the remaining piece of the scatter-gather offset that + * was masked out when we were determining the physical address + * via (sg_phys(s) & PAGE_MASK) earlier. + */ + s->dma_address += address + (s->offset & ~PAGE_MASK); s->dma_length = s->length; } From 2e5522ad5c1cca8848525721643f824cc651ca16 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Mar 2019 09:46:58 +0100 Subject: [PATCH 073/134] libceph: wait for latest osdmap in ceph_monc_blacklist_add() commit bb229bbb3bf63d23128e851a1f3b85c083178fa1 upstream. Because map updates are distributed lazily, an OSD may not know about the new blacklist for quite some time after "osd blacklist add" command is completed. This makes it possible for a blacklisted but still alive client to overwrite a post-blacklist update, resulting in data corruption. Waiting for latest osdmap in ceph_monc_blacklist_add() and thus using the post-blacklist epoch for all post-blacklist requests ensures that all such requests "wait" for the blacklist to come into force on their respective OSDs. Cc: stable@vger.kernel.org Fixes: 6305a3b41515 ("libceph: support for blacklisting clients") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman Signed-off-by: Greg Kroah-Hartman --- include/linux/ceph/libceph.h | 2 ++ net/ceph/ceph_common.c | 18 +++++++++++++++++- net/ceph/mon_client.c | 9 +++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index d3b04f9589a9..c311cd13ea7d 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -291,6 +291,8 @@ extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index cdb5b693a135..1001377ae428 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -720,7 +720,6 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) } EXPORT_SYMBOL(__ceph_open_session); - int ceph_open_session(struct ceph_client *client) { int ret; @@ -736,6 +735,23 @@ int ceph_open_session(struct ceph_client *client) } EXPORT_SYMBOL(ceph_open_session); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout) +{ + u64 newest_epoch; + int ret; + + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); + if (ret) + return ret; + + if (client->osdc.osdmap->epoch >= newest_epoch) + return 0; + + ceph_osdc_maybe_request_map(&client->osdc); + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout); +} +EXPORT_SYMBOL(ceph_wait_for_latest_osdmap); static int __init init_ceph_lib(void) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f14498a7eaec..daca0af59942 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -922,6 +922,15 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc, mutex_unlock(&monc->mutex); ret = wait_generic_request(req); + if (!ret) + /* + * Make sure we have the osdmap that includes the blacklist + * entry. This is needed to ensure that the OSDs pick up the + * new blacklist before processing any future requests from + * this client. + */ + ret = ceph_wait_for_latest_osdmap(monc->client, 0); + out: put_generic_request(req); return ret; From 6a502107f716ba502d681d23afa779a28b83ca0f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2019 15:04:18 +0100 Subject: [PATCH 074/134] udf: Fix crash on IO error during truncate commit d3ca4651d05c0ff7259d087d8c949bcf3e14fb46 upstream. When truncate(2) hits IO error when reading indirect extent block the code just bugs with: kernel BUG at linux-4.15.0/fs/udf/truncate.c:249! ... Fix the problem by bailing out cleanly in case of IO error. CC: stable@vger.kernel.org Reported-by: jean-luc malet Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/truncate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 42b8c57795cb..c6ce7503a329 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> From a3c6248c8a2ea18a151da77ea09bbbf7dc96ce59 Mon Sep 17 00:00:00 2001 From: Yifeng Li Date: Tue, 5 Mar 2019 06:00:22 +0800 Subject: [PATCH 075/134] mips: loongson64: lemote-2f: Add IRQF_NO_SUSPEND to "cascade" irqaction. commit 5f5f67da9781770df0403269bc57d7aae608fecd upstream. Timekeeping IRQs from CS5536 MFGPT are routed to i8259, which then triggers the "cascade" IRQ on MIPS CPU. Without IRQF_NO_SUSPEND in cascade_irqaction, MFGPT interrupts will be masked in suspend mode, and the machine would be unable to resume once suspended. Previously, MIPS IRQs were not disabled properly, so the original code appeared to work. Commit a3e6c1eff5 ("MIPS: IRQ: Fix disable_irq on CPU IRQs") uncovers the bug. To fix it, add IRQF_NO_SUSPEND to cascade_irqaction. This commit is functionally identical to 0add9c2f1cff ("MIPS: Loongson-3: Add IRQF_NO_SUSPEND to Cascade irqaction"), but it forgot to apply the same fix to Loongson2. Signed-off-by: Yifeng Li Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: Jiaxun Yang Cc: Huacai Chen Cc: Ralf Baechle Cc: James Hogan Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.19+ Signed-off-by: Greg Kroah-Hartman --- arch/mips/loongson64/lemote-2f/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/loongson64/lemote-2f/irq.c b/arch/mips/loongson64/lemote-2f/irq.c index 9e33e45aa17c..b213cecb8e3a 100644 --- a/arch/mips/loongson64/lemote-2f/irq.c +++ b/arch/mips/loongson64/lemote-2f/irq.c @@ -103,7 +103,7 @@ static struct irqaction ip6_irqaction = { static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", - .flags = IRQF_NO_THREAD, + .flags = IRQF_NO_THREAD | IRQF_NO_SUSPEND, }; void __init mach_init_irq(void) From 005b0a33163e4436e0d80d24d35d38f782fc80a7 Mon Sep 17 00:00:00 2001 From: Yasha Cherikovsky Date: Fri, 8 Mar 2019 14:58:51 +0200 Subject: [PATCH 076/134] MIPS: Ensure ELF appended dtb is relocated commit 3f0a53bc6482fb09770982a8447981260ea258dc upstream. This fixes booting with the combination of CONFIG_RELOCATABLE=y and CONFIG_MIPS_ELF_APPENDED_DTB=y. Sections that appear after the relocation table are not relocated on system boot (except .bss, which has special handling). With CONFIG_MIPS_ELF_APPENDED_DTB, the dtb is part of the vmlinux ELF, so it must be relocated together with everything else. Fixes: 069fd766271d ("MIPS: Reserve space for relocation table") Signed-off-by: Yasha Cherikovsky Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/vmlinux.lds.S | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 971a504001c2..36f2e860ba3e 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -140,6 +140,13 @@ SECTIONS PERCPU_SECTION(1 << CONFIG_MIPS_L1_CACHE_SHIFT) #endif +#ifdef CONFIG_MIPS_ELF_APPENDED_DTB + .appended_dtb : AT(ADDR(.appended_dtb) - LOAD_OFFSET) { + *(.appended_dtb) + KEEP(*(.appended_dtb)) + } +#endif + #ifdef CONFIG_RELOCATABLE . = ALIGN(4); @@ -164,11 +171,6 @@ SECTIONS __appended_dtb = .; /* leave space for appended DTB */ . += 0x100000; -#elif defined(CONFIG_MIPS_ELF_APPENDED_DTB) - .appended_dtb : AT(ADDR(.appended_dtb) - LOAD_OFFSET) { - *(.appended_dtb) - KEEP(*(.appended_dtb)) - } #endif /* * Align to 64K in attempt to eliminate holes before the From 88870813ee29e346b5e831987f677060fda384f9 Mon Sep 17 00:00:00 2001 From: Archer Yan Date: Fri, 8 Mar 2019 03:29:19 +0000 Subject: [PATCH 077/134] MIPS: Fix kernel crash for R6 in jump label branch function commit 47c25036b60f27b86ab44b66a8861bcf81cde39b upstream. Insert Branch instruction instead of NOP to make sure assembler don't patch code in forbidden slot. In jump label function, it might be possible to patch Control Transfer Instructions(CTIs) into forbidden slot, which will generate Reserved Instruction exception in MIPS release 6. Signed-off-by: Archer Yan Reviewed-by: Paul Burton [paul.burton@mips.com: - Add MIPS prefix to subject. - Mark for stable from v4.0, which introduced r6 support, onwards.] Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/jump_label.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index e77672539e8e..e4456e450f94 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -21,15 +21,15 @@ #endif #ifdef CONFIG_CPU_MICROMIPS -#define NOP_INSN "nop32" +#define B_INSN "b32" #else -#define NOP_INSN "nop" +#define B_INSN "b" #endif static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" NOP_INSN "\n\t" - "nop\n\t" + asm_volatile_goto("1:\t" B_INSN " 2f\n\t" + "2:\tnop\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" From 9d00ccc555ff606d6a9fc1473447946962fb54f0 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Mar 2019 13:41:50 -0500 Subject: [PATCH 078/134] scsi: ibmvscsi: Protect ibmvscsi_head from concurrent modificaiton commit 7205981e045e752ccf96cf6ddd703a98c59d4339 upstream. For each ibmvscsi host created during a probe or destroyed during a remove we either add or remove that host to/from the global ibmvscsi_head list. This runs the risk of concurrent modification. This patch adds a simple spinlock around the list modification calls to prevent concurrent updates as is done similarly in the ibmvfc driver and ipr driver. Fixes: 32d6e4b6e4ea ("scsi: ibmvscsi: add vscsi hosts to global list_head") Cc: # v4.10+ Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ibmvscsi/ibmvscsi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 53eb27731373..213c565f4bbf 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -96,6 +96,7 @@ static int client_reserve = 1; static char partition_name[96] = "UNKNOWN"; static unsigned int partition_number = -1; static LIST_HEAD(ibmvscsi_head); +static DEFINE_SPINLOCK(ibmvscsi_driver_lock); static struct scsi_transport_template *ibmvscsi_transport_template; @@ -2274,7 +2275,9 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) } dev_set_drvdata(&vdev->dev, hostdata); + spin_lock(&ibmvscsi_driver_lock); list_add_tail(&hostdata->host_list, &ibmvscsi_head); + spin_unlock(&ibmvscsi_driver_lock); return 0; add_srp_port_failed: @@ -2296,7 +2299,9 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) static int ibmvscsi_remove(struct vio_dev *vdev) { struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev); + spin_lock(&ibmvscsi_driver_lock); list_del(&hostdata->host_list); + spin_unlock(&ibmvscsi_driver_lock); unmap_persist_bufs(hostdata); release_event_pool(&hostdata->pool, hostdata); ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, From 95712a194a80d54817a5edaf54cf761e002e6d55 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Mar 2019 13:41:51 -0500 Subject: [PATCH 079/134] scsi: ibmvscsi: Fix empty event pool access during host removal commit 7f5203c13ba8a7b7f9f6ecfe5a4d5567188d7835 upstream. The event pool used for queueing commands is destroyed fairly early in the ibmvscsi_remove() code path. Since, this happens prior to the call so scsi_remove_host() it is possible for further calls to queuecommand to be processed which manifest as a panic due to a NULL pointer dereference as seen here: PANIC: "Unable to handle kernel paging request for data at address 0x00000000" Context process backtrace: DSISR: 0000000042000000 ????Syscall Result: 0000000000000000 4 [c000000002cb3820] memcpy_power7 at c000000000064204 [Link Register] [c000000002cb3820] ibmvscsi_send_srp_event at d000000003ed14a4 5 [c000000002cb3920] ibmvscsi_send_srp_event at d000000003ed14a4 [ibmvscsi] ?(unreliable) 6 [c000000002cb39c0] ibmvscsi_queuecommand at d000000003ed2388 [ibmvscsi] 7 [c000000002cb3a70] scsi_dispatch_cmd at d00000000395c2d8 [scsi_mod] 8 [c000000002cb3af0] scsi_request_fn at d00000000395ef88 [scsi_mod] 9 [c000000002cb3be0] __blk_run_queue at c000000000429860 10 [c000000002cb3c10] blk_delay_work at c00000000042a0ec 11 [c000000002cb3c40] process_one_work at c0000000000dac30 12 [c000000002cb3cd0] worker_thread at c0000000000db110 13 [c000000002cb3d80] kthread at c0000000000e3378 14 [c000000002cb3e30] ret_from_kernel_thread at c00000000000982c The kernel buffer log is overfilled with this log: [11261.952732] ibmvscsi: found no event struct in pool! This patch reorders the operations during host teardown. Start by calling the SRP transport and Scsi_Host remove functions to flush any outstanding work and set the host offline. LLDD teardown follows including destruction of the event pool, freeing the Command Response Queue (CRQ), and unmapping any persistent buffers. The event pool destruction is protected by the scsi_host lock, and the pool is purged prior of any requests for which we never received a response. Finally, move the removal of the scsi host from our global list to the end so that the host is easily locatable for debugging purposes during teardown. Cc: # v2.6.12+ Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ibmvscsi/ibmvscsi.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 213c565f4bbf..07c23bbd968c 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2299,17 +2299,27 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) static int ibmvscsi_remove(struct vio_dev *vdev) { struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev); - spin_lock(&ibmvscsi_driver_lock); - list_del(&hostdata->host_list); - spin_unlock(&ibmvscsi_driver_lock); - unmap_persist_bufs(hostdata); + unsigned long flags; + + srp_remove_host(hostdata->host); + scsi_remove_host(hostdata->host); + + purge_requests(hostdata, DID_ERROR); + + spin_lock_irqsave(hostdata->host->host_lock, flags); release_event_pool(&hostdata->pool, hostdata); + spin_unlock_irqrestore(hostdata->host->host_lock, flags); + ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, max_events); kthread_stop(hostdata->work_thread); - srp_remove_host(hostdata->host); - scsi_remove_host(hostdata->host); + unmap_persist_bufs(hostdata); + + spin_lock(&ibmvscsi_driver_lock); + list_del(&hostdata->host_list); + spin_unlock(&ibmvscsi_driver_lock); + scsi_host_put(hostdata->host); return 0; From a7e830047886221d314096183159cd52fc1d7a31 Mon Sep 17 00:00:00 2001 From: Chen Jie Date: Fri, 15 Mar 2019 03:44:38 +0000 Subject: [PATCH 080/134] futex: Ensure that futex address is aligned in handle_futex_death() commit 5a07168d8d89b00fe1760120714378175b3ef992 upstream. The futex code requires that the user space addresses of futexes are 32bit aligned. sys_futex() checks this in futex_get_keys() but the robust list code has no alignment check in place. As a consequence the kernel crashes on architectures with strict alignment requirements in handle_futex_death() when trying to cmpxchg() on an unaligned futex address which was retrieved from the robust list. [ tglx: Rewrote changelog, proper sizeof() based alignement check and add comment ] Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core") Signed-off-by: Chen Jie Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1552621478-119787-1-git-send-email-chenjie6@huawei.com Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 22f83064abb3..f2fa48c6c476 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3450,6 +3450,10 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; + retry: if (get_user(uval, uaddr)) return -1; From 060c8899d4f992f35a5a0e9306e07da4160a270e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 4 Mar 2019 15:13:21 +0200 Subject: [PATCH 081/134] perf probe: Fix getting the kernel map commit eaeffeb9838a7c0dec981d258666bfcc0fa6a947 upstream. Since commit 4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry trampolines"), perf tools has been creating more than one kernel map, however 'perf probe' assumed there could be only one. Fix by using machine__kernel_map() to get the main kernel map. Signed-off-by: Adrian Hunter Tested-by: Joseph Qi Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Greg Kroah-Hartman Cc: Jiufei Xue Cc: Peter Zijlstra Cc: stable@vger.kernel.org Cc: Xu Yu Fixes: 4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry trampolines") Fixes: d83212d5dd67 ("kallsyms, x86: Export addresses of PTI entry trampolines") Link: http://lkml.kernel.org/r/2ed432de-e904-85d2-5c36-5897ddc5b23b@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/probe-event.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 68786bb7790e..6670e12a2bb3 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -169,8 +169,10 @@ static struct map *kernel_get_module_map(const char *module) if (module && strchr(module, '/')) return dso__new_map(module); - if (!module) - module = "kernel"; + if (!module) { + pos = machine__kernel_map(host_machine); + return map__get(pos); + } for (pos = maps__first(maps); pos; pos = map__next(pos)) { /* short_name is "[module]" */ From d431ba69113dd23256b5f23d98c06ceb5ef5f056 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Mar 2019 19:09:38 -0500 Subject: [PATCH 082/134] objtool: Move objtool_file struct off the stack commit 0c671812f152b628bd87c0af49da032cc2a2c319 upstream. Objtool uses over 512k of stack, thanks to the hash table embedded in the objtool_file struct. This causes an unnecessarily large stack allocation and breaks users with low stack limits. Move the struct off the stack. Fixes: 042ba73fe7eb ("objtool: Add several performance improvements") Reported-by: Vassili Karpov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/df92dcbc4b84b02ffa252f46876df125fb56e2d7.1552954176.git.jpoimboe@redhat.com Signed-off-by: Greg Kroah-Hartman --- tools/objtool/check.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e128d1c71c30..3ff025b64527 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2132,9 +2132,10 @@ static void cleanup(struct objtool_file *file) elf_close(file->elf); } +static struct objtool_file file; + int check(const char *_objname, bool orc) { - struct objtool_file file; int ret, warnings = 0; objname = _objname; From a37fe2be55b6c965e616f5d46e2490005355d427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 24 Oct 2018 18:48:24 +0300 Subject: [PATCH 083/134] ALSA: x86: Fix runtime PM for hdmi-lpe-audio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8dfb839cfe737a17def8e5f88ee13c295230364a upstream. Commit 46e831abe864 ("drm/i915/lpe: Mark LPE audio runtime pm as "no callbacks"") broke runtime PM with lpe audio. We can no longer runtime suspend the GPU since the sysfs power/control for the lpe-audio device no longer exists and the device is considered always active. We can fix this by not marking the device as active. Cc: Chris Wilson Cc: Takashi Iwai Cc: Pierre-Louis Bossart Fixes: 46e831abe864 ("drm/i915/lpe: Mark LPE audio runtime pm as "no callbacks"") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20181024154825.18185-1-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson Acked-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/x86/intel_hdmi_audio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c index 8b7abbd69116..88fe5eb4516f 100644 --- a/sound/x86/intel_hdmi_audio.c +++ b/sound/x86/intel_hdmi_audio.c @@ -1887,7 +1887,6 @@ static int hdmi_lpe_audio_probe(struct platform_device *pdev) pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_mark_last_busy(&pdev->dev); - pm_runtime_set_active(&pdev->dev); dev_dbg(&pdev->dev, "%s: handle pending notification\n", __func__); for_each_port(card_ctx, port) { From 03c5cdb620217763e3ef0012e54508f4ec6f88ef Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Thu, 14 Mar 2019 23:19:22 -0400 Subject: [PATCH 084/134] ext4: fix NULL pointer dereference while journal is aborted commit fa30dde38aa8628c73a6dded7cb0bba38c27b576 upstream. We see the following NULL pointer dereference while running xfstests generic/475: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 8000000c84bad067 P4D 8000000c84bad067 PUD c84e62067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 9886 Comm: fsstress Kdump: loaded Not tainted 5.0.0-rc8 #10 RIP: 0010:ext4_do_update_inode+0x4ec/0x760 ... Call Trace: ? jbd2_journal_get_write_access+0x42/0x50 ? __ext4_journal_get_write_access+0x2c/0x70 ? ext4_truncate+0x186/0x3f0 ext4_mark_iloc_dirty+0x61/0x80 ext4_mark_inode_dirty+0x62/0x1b0 ext4_truncate+0x186/0x3f0 ? unmap_mapping_pages+0x56/0x100 ext4_setattr+0x817/0x8b0 notify_change+0x1df/0x430 do_truncate+0x5e/0x90 ? generic_permission+0x12b/0x1a0 This is triggered because the NULL pointer handle->h_transaction was dereferenced in function ext4_update_inode_fsync_trans(). I found that the h_transaction was set to NULL in jbd2__journal_restart but failed to attached to a new transaction while the journal is aborted. Fix this by checking the handle before updating the inode. Fixes: b436b9bef84d ("ext4: Wait for proper transaction commit on fsync") Signed-off-by: Jiufei Xue Signed-off-by: Theodore Ts'o Reviewed-by: Joseph Qi Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 48143e32411c..1437f62d068c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -387,7 +387,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; From 766b823eafb885aaa25ac979f5e5a10f051c9634 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 14 Mar 2019 23:20:25 -0400 Subject: [PATCH 085/134] ext4: fix data corruption caused by unaligned direct AIO commit 372a03e01853f860560eade508794dd274e9b390 upstream. Ext4 needs to serialize unaligned direct AIO because the zeroing of partial blocks of two competing unaligned AIOs can result in data corruption. However it decides not to serialize if the potentially unaligned aio is past i_size with the rationale that no pending writes are possible past i_size. Unfortunately if the i_size is not block aligned and the second unaligned write lands past i_size, but still into the same block, it has the potential of corrupting the previous unaligned write to the same block. This is (very simplified) reproducer from Frank // 41472 = (10 * 4096) + 512 // 37376 = 41472 - 4096 ftruncate(fd, 41472); io_prep_pwrite(iocbs[0], fd, buf[0], 4096, 37376); io_prep_pwrite(iocbs[1], fd, buf[1], 4096, 41472); io_submit(io_ctx, 1, &iocbs[1]); io_submit(io_ctx, 1, &iocbs[2]); io_getevents(io_ctx, 2, 2, events, NULL); Without this patch the 512B range from 40960 up to the start of the second unaligned write (41472) is going to be zeroed overwriting the data written by the first write. This is a data corruption. 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 With this patch the data corruption is avoided because we will recognize the unaligned_aio and wait for the unwritten extent conversion. 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 * 0000b200 Reported-by: Frank Sorenson Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: e9e3bcecf44c ("ext4: serialize unaligned asynchronous DIO") Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5cb9aa3ad249..1913c69498c1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -123,7 +123,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) From 33fb49969357376c93a20ba6fd55087b8c181ae6 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 23 Mar 2019 11:43:05 -0400 Subject: [PATCH 086/134] ext4: brelse all indirect buffer in ext4_ind_remove_space() commit 674a2b27234d1b7afcb0a9162e81b2e53aeef217 upstream. All indirect buffers get by ext4_find_shared() should be released no mater the branch should be freed or not. But now, we forget to release the lower depth indirect buffers when removing space from the same higher depth indirect block. It will lead to buffer leak and futher more, it may lead to quota information corruption when using old quota, consider the following case. - Create and mount an empty ext4 filesystem without extent and quota features, - quotacheck and enable the user & group quota, - Create some files and write some data to them, and then punch hole to some files of them, it may trigger the buffer leak problem mentioned above. - Disable quota and run quotacheck again, it will create two new aquota files and write the checked quota information to them, which probably may reuse the freed indirect block(the buffer and page cache was not freed) as data block. - Enable quota again, it will invoke vfs_load_quota_inode()->invalidate_bdev() to try to clean unused buffers and pagecache. Unfortunately, because of the buffer of quota data block is still referenced, quota code cannot read the up to date quota info from the device and lead to quota information corruption. This problem can be reproduced by xfstests generic/231 on ext3 file system or ext4 file system without extent and quota features. This patch fix this problem by releasing the missing indirect buffers, in ext4_ind_remove_space(). Reported-by: Hulk Robot Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/indirect.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..9e96a0bd08d9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1387,10 +1387,14 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + } + while (partial2 > chain2) { + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } return 0; } From 3616a46e4622df40e2d9c1a22c305e769eff9775 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 18 Dec 2018 08:37:08 -0500 Subject: [PATCH 087/134] media: v4l2-ctrls.c/uvc: zero v4l2_event commit f45f3f753b0a3d739acda8e311b4f744d82dc52a upstream. Control events can leak kernel memory since they do not fully zero the event. The same code is present in both v4l2-ctrls.c and uvc_ctrl.c, so fix both. It appears that all other event code is properly zeroing the structure, it's these two places. Signed-off-by: Hans Verkuil Reported-by: syzbot+4f021cf3697781dbd9fb@syzkaller.appspotmail.com Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/uvc/uvc_ctrl.c | 2 +- drivers/media/v4l2-core/v4l2-ctrls.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 20397aba6849..d92967e2e385 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -1203,7 +1203,7 @@ static void uvc_ctrl_fill_event(struct uvc_video_chain *chain, __uvc_query_v4l2_ctrl(chain, ctrl, mapping, &v4l2_ctrl); - memset(ev->reserved, 0, sizeof(ev->reserved)); + memset(ev, 0, sizeof(*ev)); ev->type = V4L2_EVENT_CTRL; ev->id = v4l2_ctrl.id; ev->u.ctrl.value = value; diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 8033d6f73501..07bd2008ae4b 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -1239,7 +1239,7 @@ static u32 user_flags(const struct v4l2_ctrl *ctrl) static void fill_event(struct v4l2_event *ev, struct v4l2_ctrl *ctrl, u32 changes) { - memset(ev->reserved, 0, sizeof(ev->reserved)); + memset(ev, 0, sizeof(*ev)); ev->type = V4L2_EVENT_CTRL; ev->id = ctrl->id; ev->u.ctrl.changes = changes; From 86384a1fa3e5389e58d8a6673ed09143abe1354e Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Tue, 22 Jan 2019 00:33:26 -0800 Subject: [PATCH 088/134] Bluetooth: hci_uart: Check if socket buffer is ERR_PTR in h4_recv_buf() commit 1dc2d785156cbdc80806c32e8d2c7c735d0b4721 upstream. h4_recv_buf() callers store the return value to socket buffer and recursively pass the buffer to h4_recv_buf() without protection. So, ERR_PTR returned from h4_recv_buf() can be dereferenced, if called again before setting the socket buffer to NULL from previous error. Check if skb is ERR_PTR in h4_recv_buf(). Reported-by: syzbot+017a32f149406df32703@syzkaller.appspotmail.com Signed-off-by: Myungho Jung Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_h4.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index 3b82a87224a9..d428117c97c3 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -174,6 +174,10 @@ struct sk_buff *h4_recv_buf(struct hci_dev *hdev, struct sk_buff *skb, struct hci_uart *hu = hci_get_drvdata(hdev); u8 alignment = hu->alignment ? hu->alignment : 1; + /* Check for error from previous call */ + if (IS_ERR(skb)) + skb = NULL; + while (count) { int i, len; From 3df00eb895f8ea16ccbfb6db49dc905f99ac9d17 Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Sat, 2 Feb 2019 16:56:36 -0800 Subject: [PATCH 089/134] Bluetooth: Fix decrementing reference count twice in releasing socket commit e20a2e9c42c9e4002d9e338d74e7819e88d77162 upstream. When releasing socket, it is possible to enter hci_sock_release() and hci_sock_dev_event(HCI_DEV_UNREG) at the same time in different thread. The reference count of hdev should be decremented only once from one of them but if storing hdev to local variable in hci_sock_release() before detached from socket and setting to NULL in hci_sock_dev_event(), hci_dev_put(hdev) is unexpectedly called twice. This is resolved by referencing hdev from socket after bt_sock_unlink() in hci_sock_release(). Reported-by: syzbot+fdc00003f4efff43bc5b@syzkaller.appspotmail.com Signed-off-by: Myungho Jung Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/hci_sock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 65d734c165bd..4a05235929b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -826,8 +826,6 @@ static int hci_sock_release(struct socket *sock) if (!sk) return 0; - hdev = hci_pi(sk)->hdev; - switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); @@ -849,6 +847,7 @@ static int hci_sock_release(struct socket *sock) bt_sock_unlink(&hci_sk_list, sk); + hdev = hci_pi(sk)->hdev; if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { /* When releasing a user channel exclusive access, From 6ea83d9338c181de28ca60cbf221b0a4b94c4fd9 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Wed, 6 Feb 2019 12:54:16 -0500 Subject: [PATCH 090/134] Bluetooth: hci_ldisc: Initialize hci_dev before open() commit 32a7b4cbe93b0a0ef7e63d31ca69ce54736c4412 upstream. The hci_dev struct hdev is referenced in work queues and timers started by open() in some protocols. This creates a race between the initialization function and the work or timer which can result hdev being dereferenced while it is still null. The syzbot report contains a reliable reproducer which causes a null pointer dereference of hdev in hci_uart_write_work() by making the memory allocation for hdev fail. To fix this, ensure hdev is valid from before calling a protocol's open() until after calling a protocol's close(). Reported-by: syzbot+257790c15bcdef6fe00c@syzkaller.appspotmail.com Signed-off-by: Jeremy Cline Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_ldisc.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 30bbe19b4b85..6a3a3011b38e 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -207,11 +207,11 @@ static void hci_uart_init_work(struct work_struct *work) err = hci_register_dev(hu->hdev); if (err < 0) { BT_ERR("Can't register HCI device"); + clear_bit(HCI_UART_PROTO_READY, &hu->flags); + hu->proto->close(hu); hdev = hu->hdev; hu->hdev = NULL; hci_free_dev(hdev); - clear_bit(HCI_UART_PROTO_READY, &hu->flags); - hu->proto->close(hu); return; } @@ -612,6 +612,7 @@ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data, static int hci_uart_register_dev(struct hci_uart *hu) { struct hci_dev *hdev; + int err; BT_DBG(""); @@ -655,11 +656,22 @@ static int hci_uart_register_dev(struct hci_uart *hu) else hdev->dev_type = HCI_PRIMARY; + /* Only call open() for the protocol after hdev is fully initialized as + * open() (or a timer/workqueue it starts) may attempt to reference it. + */ + err = hu->proto->open(hu); + if (err) { + hu->hdev = NULL; + hci_free_dev(hdev); + return err; + } + if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return 0; if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); + hu->proto->close(hu); hu->hdev = NULL; hci_free_dev(hdev); return -ENODEV; @@ -679,17 +691,12 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id) if (!p) return -EPROTONOSUPPORT; - err = p->open(hu); - if (err) - return err; - hu->proto = p; set_bit(HCI_UART_PROTO_READY, &hu->flags); err = hci_uart_register_dev(hu); if (err) { clear_bit(HCI_UART_PROTO_READY, &hu->flags); - p->close(hu); return err; } From a1dbb34da6f2edf321df5023cb2accd92579269b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 23 Feb 2019 12:33:27 +0800 Subject: [PATCH 091/134] Bluetooth: hci_ldisc: Postpone HCI_UART_PROTO_READY bit set in hci_uart_set_proto() commit 56897b217a1d0a91c9920cb418d6b3fe922f590a upstream. task A: task B: hci_uart_set_proto flush_to_ldisc - p->open(hu) -> h5_open //alloc h5 - receive_buf - set_bit HCI_UART_PROTO_READY - tty_port_default_receive_buf - hci_uart_register_dev - tty_ldisc_receive_buf - hci_uart_tty_receive - test_bit HCI_UART_PROTO_READY - h5_recv - clear_bit HCI_UART_PROTO_READY while() { - p->open(hu) -> h5_close //free h5 - h5_rx_3wire_hdr - h5_reset() //use-after-free } It could use ioctl to set hci uart proto, but there is a use-after-free issue when hci_uart_register_dev() fail in hci_uart_set_proto(), see stack above, fix this by setting HCI_UART_PROTO_READY bit only when hci_uart_register_dev() return success. Reported-by: syzbot+899a33dc0fa0dbaf06a6@syzkaller.appspotmail.com Signed-off-by: Kefeng Wang Reviewed-by: Jeremy Cline Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_ldisc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 6a3a3011b38e..3b63a781f10f 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -692,14 +692,13 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id) return -EPROTONOSUPPORT; hu->proto = p; - set_bit(HCI_UART_PROTO_READY, &hu->flags); err = hci_uart_register_dev(hu); if (err) { - clear_bit(HCI_UART_PROTO_READY, &hu->flags); return err; } + set_bit(HCI_UART_PROTO_READY, &hu->flags); return 0; } From 4f160d8dd684f8bb12f3a21bfe88ad37667246c4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 30 Dec 2018 12:28:42 +0000 Subject: [PATCH 092/134] drm: Reorder set_property_atomic to avoid returning with an active ww_ctx commit 227ad6d957898a88b1746e30234ece64d305f066 upstream. Delay the drm_modeset_acquire_init() until after we check for an allocation failure so that we can return immediately upon error without having to unwind. WARNING: lock held when returning to user space! 4.20.0+ #174 Not tainted ------------------------------------------------ syz-executor556/8153 is leaving the kernel with locks still held! 1 lock held by syz-executor556/8153: #0: 000000005100c85c (crtc_ww_class_acquire){+.+.}, at: set_property_atomic+0xb3/0x330 drivers/gpu/drm/drm_mode_object.c:462 Reported-by: syzbot+6ea337c427f5083ebdf2@syzkaller.appspotmail.com Fixes: 144a7999d633 ("drm: Handle properties in the core for atomic drivers") Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Sean Paul Cc: David Airlie Cc: # v4.14+ Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20181230122842.21917-1-chris@chris-wilson.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_mode_object.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c index 1055533792f3..5b692ce6a45d 100644 --- a/drivers/gpu/drm/drm_mode_object.c +++ b/drivers/gpu/drm/drm_mode_object.c @@ -432,12 +432,13 @@ static int set_property_atomic(struct drm_mode_object *obj, struct drm_modeset_acquire_ctx ctx; int ret; - drm_modeset_acquire_init(&ctx, 0); - state = drm_atomic_state_alloc(dev); if (!state) return -ENOMEM; + + drm_modeset_acquire_init(&ctx, 0); state->acquire_ctx = &ctx; + retry: if (prop == state->dev->mode_config.dpms_property) { if (obj->type != DRM_MODE_OBJECT_CONNECTOR) { From b763bd262a792684cdca3040ad9f45106304df62 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 19 Feb 2019 00:37:21 +0100 Subject: [PATCH 093/134] netfilter: ebtables: remove BUGPRINT messages commit d824548dae220820bdf69b2d1561b7c4b072783f upstream. They are however frequently triggered by syzkaller, so remove them. ebtables userspace should never trigger any of these, so there is little value in making them pr_debug (or ratelimited). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/bridge/netfilter/ebtables.c | 131 ++++++++++---------------------- 1 file changed, 39 insertions(+), 92 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 53392ac58b38..38b3309edba8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -31,10 +31,6 @@ /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" -#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\ - "report to author: "format, ## args) -/* #define BUGPRINT(format, args...) */ - /* Each cpu has its own set of counters, so there is no need for write_lock in * the softirq * For reading or updating the counters, the user context needs to @@ -453,8 +449,6 @@ static int ebt_verify_pointers(const struct ebt_replace *repl, /* we make userspace set this right, * so there is no misunderstanding */ - BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set " - "in distinguisher\n"); return -EINVAL; } if (i != NF_BR_NUMHOOKS) @@ -472,18 +466,14 @@ static int ebt_verify_pointers(const struct ebt_replace *repl, offset += e->next_offset; } } - if (offset != limit) { - BUGPRINT("entries_size too small\n"); + if (offset != limit) return -EINVAL; - } /* check if all valid hooks have a chain */ for (i = 0; i < NF_BR_NUMHOOKS; i++) { if (!newinfo->hook_entry[i] && - (valid_hooks & (1 << i))) { - BUGPRINT("Valid hook without chain\n"); + (valid_hooks & (1 << i))) return -EINVAL; - } } return 0; } @@ -510,26 +500,20 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e, /* this checks if the previous chain has as many entries * as it said it has */ - if (*n != *cnt) { - BUGPRINT("nentries does not equal the nr of entries " - "in the chain\n"); + if (*n != *cnt) return -EINVAL; - } + if (((struct ebt_entries *)e)->policy != EBT_DROP && ((struct ebt_entries *)e)->policy != EBT_ACCEPT) { /* only RETURN from udc */ if (i != NF_BR_NUMHOOKS || - ((struct ebt_entries *)e)->policy != EBT_RETURN) { - BUGPRINT("bad policy\n"); + ((struct ebt_entries *)e)->policy != EBT_RETURN) return -EINVAL; - } } if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */ (*udc_cnt)++; - if (((struct ebt_entries *)e)->counter_offset != *totalcnt) { - BUGPRINT("counter_offset != totalcnt"); + if (((struct ebt_entries *)e)->counter_offset != *totalcnt) return -EINVAL; - } *n = ((struct ebt_entries *)e)->nentries; *cnt = 0; return 0; @@ -537,15 +521,13 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e, /* a plain old entry, heh */ if (sizeof(struct ebt_entry) > e->watchers_offset || e->watchers_offset > e->target_offset || - e->target_offset >= e->next_offset) { - BUGPRINT("entry offsets not in right order\n"); + e->target_offset >= e->next_offset) return -EINVAL; - } + /* this is not checked anywhere else */ - if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) { - BUGPRINT("target size too small\n"); + if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) return -EINVAL; - } + (*cnt)++; (*totalcnt)++; return 0; @@ -665,18 +647,15 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, if (e->bitmask == 0) return 0; - if (e->bitmask & ~EBT_F_MASK) { - BUGPRINT("Unknown flag for bitmask\n"); + if (e->bitmask & ~EBT_F_MASK) return -EINVAL; - } - if (e->invflags & ~EBT_INV_MASK) { - BUGPRINT("Unknown flag for inv bitmask\n"); + + if (e->invflags & ~EBT_INV_MASK) return -EINVAL; - } - if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) { - BUGPRINT("NOPROTO & 802_3 not allowed\n"); + + if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) return -EINVAL; - } + /* what hook do we belong to? */ for (i = 0; i < NF_BR_NUMHOOKS; i++) { if (!newinfo->hook_entry[i]) @@ -735,13 +714,11 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, t->u.target = target; if (t->u.target == &ebt_standard_target) { if (gap < sizeof(struct ebt_standard_target)) { - BUGPRINT("Standard target size too big\n"); ret = -EFAULT; goto cleanup_watchers; } if (((struct ebt_standard_target *)t)->verdict < -NUM_STANDARD_TARGETS) { - BUGPRINT("Invalid standard target\n"); ret = -EFAULT; goto cleanup_watchers; } @@ -801,10 +778,9 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > - e->next_offset) { - BUGPRINT("Standard target size too big\n"); + e->next_offset) return -1; - } + verdict = ((struct ebt_standard_target *)t)->verdict; if (verdict >= 0) { /* jump to another chain */ struct ebt_entries *hlp2 = @@ -813,14 +789,12 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (hlp2 == cl_s[i].cs.chaininfo) break; /* bad destination or loop */ - if (i == udc_cnt) { - BUGPRINT("bad destination\n"); + if (i == udc_cnt) return -1; - } - if (cl_s[i].cs.n) { - BUGPRINT("loop\n"); + + if (cl_s[i].cs.n) return -1; - } + if (cl_s[i].hookmask & (1 << hooknr)) goto letscontinue; /* this can't be 0, so the loop test is correct */ @@ -853,24 +827,21 @@ static int translate_table(struct net *net, const char *name, i = 0; while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i]) i++; - if (i == NF_BR_NUMHOOKS) { - BUGPRINT("No valid hooks specified\n"); + if (i == NF_BR_NUMHOOKS) return -EINVAL; - } - if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) { - BUGPRINT("Chains don't start at beginning\n"); + + if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) return -EINVAL; - } + /* make sure chains are ordered after each other in same order * as their corresponding hooks */ for (j = i + 1; j < NF_BR_NUMHOOKS; j++) { if (!newinfo->hook_entry[j]) continue; - if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) { - BUGPRINT("Hook order must be followed\n"); + if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) return -EINVAL; - } + i = j; } @@ -888,15 +859,11 @@ static int translate_table(struct net *net, const char *name, if (ret != 0) return ret; - if (i != j) { - BUGPRINT("nentries does not equal the nr of entries in the " - "(last) chain\n"); + if (i != j) return -EINVAL; - } - if (k != newinfo->nentries) { - BUGPRINT("Total nentries is wrong\n"); + + if (k != newinfo->nentries) return -EINVAL; - } /* get the location of the udc, put them in an array * while we're at it, allocate the chainstack @@ -929,7 +896,6 @@ static int translate_table(struct net *net, const char *name, ebt_get_udc_positions, newinfo, &i, cl_s); /* sanity check */ if (i != udc_cnt) { - BUGPRINT("i != udc_cnt\n"); vfree(cl_s); return -EFAULT; } @@ -1030,7 +996,6 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_unlock; if (repl->num_counters && repl->num_counters != t->private->nentries) { - BUGPRINT("Wrong nr. of counters requested\n"); ret = -EINVAL; goto free_unlock; } @@ -1115,15 +1080,12 @@ static int do_replace(struct net *net, const void __user *user, if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.entries_size) { - BUGPRINT("Wrong len argument\n"); + if (len != sizeof(tmp) + tmp.entries_size) return -EINVAL; - } - if (tmp.entries_size == 0) { - BUGPRINT("Entries_size never zero\n"); + if (tmp.entries_size == 0) return -EINVAL; - } + /* overflow check */ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter)) @@ -1150,7 +1112,6 @@ static int do_replace(struct net *net, const void __user *user, } if (copy_from_user( newinfo->entries, tmp.entries, tmp.entries_size) != 0) { - BUGPRINT("Couldn't copy entries from userspace\n"); ret = -EFAULT; goto free_entries; } @@ -1197,10 +1158,8 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, if (input_table == NULL || (repl = input_table->table) == NULL || repl->entries == NULL || repl->entries_size == 0 || - repl->counters != NULL || input_table->private != NULL) { - BUGPRINT("Bad table data for ebt_register_table!!!\n"); + repl->counters != NULL || input_table->private != NULL) return -EINVAL; - } /* Don't add one table to multiple lists. */ table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL); @@ -1238,13 +1197,10 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, ((char *)repl->hook_entry[i] - repl->entries); } ret = translate_table(net, repl->name, newinfo); - if (ret != 0) { - BUGPRINT("Translate_table failed\n"); + if (ret != 0) goto free_chainstack; - } if (table->check && table->check(newinfo, table->valid_hooks)) { - BUGPRINT("The table doesn't like its own initial data, lol\n"); ret = -EINVAL; goto free_chainstack; } @@ -1255,7 +1211,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; - BUGPRINT("Table name already exists\n"); goto free_unlock; } } @@ -1327,7 +1282,6 @@ static int do_update_counters(struct net *net, const char *name, goto free_tmp; if (num_counters != t->private->nentries) { - BUGPRINT("Wrong nr of counters\n"); ret = -EINVAL; goto unlock_mutex; } @@ -1452,10 +1406,8 @@ static int copy_counters_to_user(struct ebt_table *t, if (num_counters == 0) return 0; - if (num_counters != nentries) { - BUGPRINT("Num_counters wrong\n"); + if (num_counters != nentries) return -EINVAL; - } counterstmp = vmalloc(nentries * sizeof(*counterstmp)); if (!counterstmp) @@ -1501,15 +1453,11 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, (tmp.num_counters ? nentries * sizeof(struct ebt_counter) : 0)) return -EINVAL; - if (tmp.nentries != nentries) { - BUGPRINT("Nentries wrong\n"); + if (tmp.nentries != nentries) return -EINVAL; - } - if (tmp.entries_size != entries_size) { - BUGPRINT("Wrong size\n"); + if (tmp.entries_size != entries_size) return -EINVAL; - } ret = copy_counters_to_user(t, oldcounters, tmp.counters, tmp.num_counters, nentries); @@ -1581,7 +1529,6 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } mutex_unlock(&ebt_mutex); if (copy_to_user(user, &tmp, *len) != 0) { - BUGPRINT("c2u Didn't work\n"); ret = -EFAULT; break; } From c60f18c6653fe6675bd66fb2b7008d3fc14e957a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Mar 2019 04:12:00 +0100 Subject: [PATCH 094/134] x86/unwind: Handle NULL pointer calls better in frame unwinder commit f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5 upstream. When the frame unwinder is invoked for an oops caused by a call to NULL, it currently skips the parent function because BP still points to the parent's stack frame; the (nonexistent) current function only has the first half of a stack frame, and BP doesn't point to it yet. Add a special case for IP==0 that calculates a fake BP from SP, then uses the real BP for the next frame. Note that this handles first_frame specially: Return information about the parent function as long as the saved IP is >=first_frame, even if the fake BP points below it. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace is: Call Trace: prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/unwind.h | 6 ++++++ arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 1f86e1b0a5cd..499578f7e6d7 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -23,6 +23,12 @@ struct unwind_state { #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; + /* + * If non-NULL: The current frame is incomplete and doesn't contain a + * valid BP. When looking for the next frame, use this instead of the + * non-existent saved BP. + */ + unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3dc26f95d46e..9b9fd4826e7a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state) } /* Get the next frame pointer: */ - if (state->regs) + if (state->next_bp) { + next_bp = state->next_bp; + state->next_bp = NULL; + } else if (state->regs) { next_bp = (unsigned long *)state->regs->bp; - else + } else { next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + } /* Move to the next frame if it's safe: */ if (!update_stack_state(state, next_bp)) @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, bp = get_frame_pointer(task, regs); + /* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer. + * That means that SP points into the middle of an incomplete frame: + * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we + * would have written a frame pointer if we hadn't crashed. + * Pretend that the frame is complete and that BP points to it, but save + * the real BP so that we can use it when looking for the next frame. + */ + if (regs && regs->ip == 0 && + (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + state->next_bp = bp; + bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + } + /* Initialize stack info and make sure the frame data is accessible: */ get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); @@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->bp < first_frame)) + (state->next_bp == NULL && state->bp < first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start); From 5befc25f5cd2a1ec90bca48d4e03270b1005c70b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Mar 2019 04:12:01 +0100 Subject: [PATCH 095/134] x86/unwind: Add hardcoded ORC entry for NULL commit ac5ceccce5501e43d217c596e4ee859f2a3fef79 upstream. When the ORC unwinder is invoked for an oops caused by IP==0, it currently has no idea what to do because there is no debug information for the stack frame of NULL. But if RIP is NULL, it is very likely that the last successfully executed instruction was an indirect CALL/JMP, and it is possible to unwind out in the same way as for the first instruction of a normal function. Hardcode a corresponding ORC entry. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 ? do_syscall_64+0x6d/0x160 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace looks like this: Call Trace: __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 do_syscall_64+0x6d/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 prctl_set_seccomp() still doesn't show up in the trace because for some reason, tail call optimization is only disabled in builds that use the frame pointer unwinder. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-2-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/unwind_orc.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..3bbb399f7ead 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,11 +74,28 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. + */ +static struct orc_entry null_orc_entry = { + .sp_offset = sizeof(long), + .sp_reg = ORC_REG_SP, + .bp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + static struct orc_entry *orc_find(unsigned long ip) { if (!orc_init) return NULL; + if (ip == 0) + return &null_orc_entry; + /* For non-init vmlinux addresses, use the fast lookup table: */ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { unsigned int idx, start, stop; From 4a195a0bc2e954b91085d5c82eb20c51835ee7b0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 9 Jan 2019 23:03:25 -0500 Subject: [PATCH 096/134] locking/lockdep: Add debug_locks check in __lock_downgrade() commit 71492580571467fb7177aade19c18ce7486267f5 upstream. Tetsuo Handa had reported he saw an incorrect "downgrading a read lock" warning right after a previous lockdep warning. It is likely that the previous warning turned off lock debugging causing the lockdep to have inconsistency states leading to the lock downgrade warning. Fix that by add a check for debug_locks at the beginning of __lock_downgrade(). Debugged-by: Tetsuo Handa Reported-by: Tetsuo Handa Reported-by: syzbot+53383ae265fb161ef488@syzkaller.appspotmail.com Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/1547093005-26085-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/locking/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf694c709b96..e57be7031cb3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3650,6 +3650,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, From 886e8316b599a1f704f90e768547cc63725a8474 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 29 Jan 2019 14:03:33 +0100 Subject: [PATCH 097/134] ALSA: hda - Record the current power state before suspend/resume calls commit 98081ca62cbac31fb0f7efaf90b2e7384ce22257 upstream. Currently we deal with single codec and suspend codec callbacks for all S3, S4 and runtime PM handling. But it turned out that we want distinguish the call patterns sometimes, e.g. for applying some init sequence only at probing and restoring from hibernate. This patch slightly modifies the common PM callbacks for HD-audio codec and stores the currently processed PM event in power_state of the codec's device.power field, which is currently unused. The codec callback can take a look at this event value and judges which purpose it's being called. Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_codec.c | 43 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 8a027973f2ad..cdb43de7d659 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -2900,6 +2900,7 @@ static void hda_call_codec_resume(struct hda_codec *codec) hda_jackpoll_work(&codec->jackpoll_work.work); else snd_hda_jack_report_sync(codec); + codec->core.dev.power.power_state = PMSG_ON; atomic_dec(&codec->core.in_pm); } @@ -2932,10 +2933,48 @@ static int hda_codec_runtime_resume(struct device *dev) } #endif /* CONFIG_PM */ +#ifdef CONFIG_PM_SLEEP +static int hda_codec_pm_suspend(struct device *dev) +{ + dev->power.power_state = PMSG_SUSPEND; + return pm_runtime_force_suspend(dev); +} + +static int hda_codec_pm_resume(struct device *dev) +{ + dev->power.power_state = PMSG_RESUME; + return pm_runtime_force_resume(dev); +} + +static int hda_codec_pm_freeze(struct device *dev) +{ + dev->power.power_state = PMSG_FREEZE; + return pm_runtime_force_suspend(dev); +} + +static int hda_codec_pm_thaw(struct device *dev) +{ + dev->power.power_state = PMSG_THAW; + return pm_runtime_force_resume(dev); +} + +static int hda_codec_pm_restore(struct device *dev) +{ + dev->power.power_state = PMSG_RESTORE; + return pm_runtime_force_resume(dev); +} +#endif /* CONFIG_PM_SLEEP */ + /* referred in hda_bind.c */ const struct dev_pm_ops hda_codec_driver_pm = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) +#ifdef CONFIG_PM_SLEEP + .suspend = hda_codec_pm_suspend, + .resume = hda_codec_pm_resume, + .freeze = hda_codec_pm_freeze, + .thaw = hda_codec_pm_thaw, + .poweroff = hda_codec_pm_suspend, + .restore = hda_codec_pm_restore, +#endif /* CONFIG_PM_SLEEP */ SET_RUNTIME_PM_OPS(hda_codec_runtime_suspend, hda_codec_runtime_resume, NULL) }; From 5c622e33da8febc0de0512344cd5a35c17a7676f Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 19 Mar 2019 09:28:44 +0800 Subject: [PATCH 098/134] ALSA: hda - Enforces runtime_resume after S3 and S4 for each codec commit b5a236c175b0d984552a5f7c9d35141024c2b261 upstream. Recently we found the audio jack detection stop working after suspend on many machines with Realtek codec. Sometimes the audio selection dialogue didn't show up after users plugged headhphone/headset into the headset jack, sometimes after uses plugged headphone/headset, then click the sound icon on the upper-right corner of gnome-desktop, it also showed the speaker rather than the headphone. The root cause is that before suspend, the codec already call the runtime_suspend since this codec is not used by any apps, then in resume, it will not call runtime_resume for this codec. But for some realtek codec (so far, alc236, alc255 and alc891) with the specific BIOS, if it doesn't run runtime_resume after suspend, all codec functions including jack detection stop working anymore. This problem existed for a long time, but it was not exposed, that is because when problem happens, if users play sound or open sound-setting to check audio device, this will trigger calling to runtime_resume (via snd_hda_power_up), then the codec starts working again before users notice this problem. Since we don't know how many codec and BIOS combinations have this problem, to fix it, let the driver call runtime_resume for all codecs in pm_resume, maybe for some codecs, this is not needed, but it is harmless. After a codec is runtime resumed, if it is not used by any apps, it will be runtime suspended soon and furthermore we don't run suspend frequently, this change will not add much power consumption. Fixes: cc72da7d4d06 ("ALSA: hda - Use standard runtime PM for codec power-save control") Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_codec.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index cdb43de7d659..e3f3351da480 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -2934,6 +2934,20 @@ static int hda_codec_runtime_resume(struct device *dev) #endif /* CONFIG_PM */ #ifdef CONFIG_PM_SLEEP +static int hda_codec_force_resume(struct device *dev) +{ + int ret; + + /* The get/put pair below enforces the runtime resume even if the + * device hasn't been used at suspend time. This trick is needed to + * update the jack state change during the sleep. + */ + pm_runtime_get_noresume(dev); + ret = pm_runtime_force_resume(dev); + pm_runtime_put(dev); + return ret; +} + static int hda_codec_pm_suspend(struct device *dev) { dev->power.power_state = PMSG_SUSPEND; @@ -2943,7 +2957,7 @@ static int hda_codec_pm_suspend(struct device *dev) static int hda_codec_pm_resume(struct device *dev) { dev->power.power_state = PMSG_RESUME; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } static int hda_codec_pm_freeze(struct device *dev) @@ -2955,13 +2969,13 @@ static int hda_codec_pm_freeze(struct device *dev) static int hda_codec_pm_thaw(struct device *dev) { dev->power.power_state = PMSG_THAW; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } static int hda_codec_pm_restore(struct device *dev) { dev->power.power_state = PMSG_RESTORE; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } #endif /* CONFIG_PM_SLEEP */ From aab86217763b06bcb563cb69dbbff8d598b52a39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:04 -0800 Subject: [PATCH 099/134] lib/int_sqrt: optimize small argument commit 3f3295709edea6268ff1609855f498035286af73 upstream. The current int_sqrt() computation is sub-optimal for the case of small @x. Which is the interesting case when we're going to do cumulative distribution functions on idle times, which we assume to be a random variable, where the target residency of the deepest idle state gives an upper bound on the variable (5e6ns on recent Intel chips). In the case of small @x, the compute loop: while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } can be reduced to: while (m > x) m >>= 2; Because y==0, b==m and until x>=m y will remain 0. And while this is computationally equivalent, it runs much faster because there's less code, in particular less branches. cycles: branches: branch-misses: OLD: hot: 45.109444 +- 0.044117 44.333392 +- 0.002254 0.018723 +- 0.000593 cold: 187.737379 +- 0.156678 44.333407 +- 0.002254 6.272844 +- 0.004305 PRE: hot: 67.937492 +- 0.064124 66.999535 +- 0.000488 0.066720 +- 0.001113 cold: 232.004379 +- 0.332811 66.999527 +- 0.000488 6.914634 +- 0.006568 POST: hot: 43.633557 +- 0.034373 45.333132 +- 0.002277 0.023529 +- 0.000681 cold: 207.438411 +- 0.125840 45.333132 +- 0.002277 6.976486 +- 0.004219 Averages computed over all values <128k using a LFSR to generate order. Cold numbers have a LFSR based branch trace buffer 'confuser' ran between each int_sqrt() invocation. Link: http://lkml.kernel.org/r/20171020164644.876503355@infradead.org Fixes: 30493cc9dddb ("lib/int_sqrt.c: optimize square root algorithm") Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Anshul Garg Acked-by: Linus Torvalds Cc: Davidlohr Bueso Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Will Deacon Cc: Joe Perches Cc: David Miller Cc: Matthew Wilcox Cc: Kees Cook Cc: Michael Davidson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- lib/int_sqrt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index db0b5aa071fc..036c96781ea8 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -23,6 +23,9 @@ unsigned long int_sqrt(unsigned long x) return x; m = 1UL << (BITS_PER_LONG - 2); + while (m > x) + m >>= 2; + while (m != 0) { b = y + m; y >>= 1; From 4a4aed9055e8e8e6e7becf10821d5f2e1c90c3a8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 11 Dec 2017 22:48:41 +0100 Subject: [PATCH 100/134] USB: core: only clean up what we allocated commit 32fd87b3bbf5f7a045546401dfe2894dbbf4d8c3 upstream. When cleaning up the configurations, make sure we only free the number of configurations and interfaces that we could have allocated. Reported-by: Andrey Konovalov Cc: stable Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index bd749e78df59..1a6ccdd5a5fc 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -768,18 +768,21 @@ void usb_destroy_configuration(struct usb_device *dev) return; if (dev->rawdescriptors) { - for (i = 0; i < dev->descriptor.bNumConfigurations; i++) + for (i = 0; i < dev->descriptor.bNumConfigurations && + i < USB_MAXCONFIG; i++) kfree(dev->rawdescriptors[i]); kfree(dev->rawdescriptors); dev->rawdescriptors = NULL; } - for (c = 0; c < dev->descriptor.bNumConfigurations; c++) { + for (c = 0; c < dev->descriptor.bNumConfigurations && + c < USB_MAXCONFIG; c++) { struct usb_host_config *cf = &dev->config[c]; kfree(cf->string); - for (i = 0; i < cf->desc.bNumInterfaces; i++) { + for (i = 0; i < cf->desc.bNumInterfaces && + i < USB_MAXINTERFACES; i++) { if (cf->intf_cache[i]) kref_put(&cf->intf_cache[i]->ref, usb_release_interface_cache); From 4aac26ecb4ace3df840af95408f2279c85684f62 Mon Sep 17 00:00:00 2001 From: kehuanlin Date: Wed, 6 Sep 2017 17:58:39 +0800 Subject: [PATCH 101/134] scsi: ufs: fix wrong command type of UTRD for UFSHCI v2.1 commit 83dc7e3dea76b77b6bcc289eb86c5b5c145e8dff upstream. Since the command type of UTRD in UFS 2.1 specification is the same with UFS 2.0. And it assumes the future UFS specification will follow the same definition. Signed-off-by: kehuanlin Reviewed-by: Subhash Jadavani Signed-off-by: Martin K. Petersen Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ufs/ufshcd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 66540491839e..581571de2461 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -2195,10 +2195,11 @@ static int ufshcd_comp_devman_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) u32 upiu_flags; int ret = 0; - if (hba->ufs_version == UFSHCI_VERSION_20) - lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; - else + if ((hba->ufs_version == UFSHCI_VERSION_10) || + (hba->ufs_version == UFSHCI_VERSION_11)) lrbp->command_type = UTP_CMD_TYPE_DEV_MANAGE; + else + lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE); if (hba->dev_cmd.type == DEV_CMD_TYPE_QUERY) @@ -2222,10 +2223,11 @@ static int ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) u32 upiu_flags; int ret = 0; - if (hba->ufs_version == UFSHCI_VERSION_20) - lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; - else + if ((hba->ufs_version == UFSHCI_VERSION_10) || + (hba->ufs_version == UFSHCI_VERSION_11)) lrbp->command_type = UTP_CMD_TYPE_SCSI; + else + lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE; if (likely(lrbp->cmd)) { ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, From a9d76f59faffc0447701b902ed8322d35c45a983 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 20 Dec 2017 00:29:23 +0100 Subject: [PATCH 102/134] PCI: designware-ep: dw_pcie_ep_set_msi() should only set MMC bits commit 099a95f3591ade29da52131895a3ba9f92a0e82c upstream. Previously, dw_pcie_ep_set_msi() wrote all bits in the Message Control register, thus overwriting the PCI_MSI_FLAGS_64BIT bit. By clearing the PCI_MSI_FLAGS_64BIT bit, we break MSI on systems where the RC has set a 64 bit MSI address. Fix dw_pcie_ep_set_msi() so that it only sets MMC bits. Tested-by: Gustavo Pimentel Signed-off-by: Niklas Cassel Signed-off-by: Lorenzo Pieralisi Acked-by: Joao Pinto Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/pci/dwc/pcie-designware-ep.c | 4 +++- drivers/pci/dwc/pcie-designware.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/dwc/pcie-designware-ep.c b/drivers/pci/dwc/pcie-designware-ep.c index 7c621877a939..df317d390317 100644 --- a/drivers/pci/dwc/pcie-designware-ep.c +++ b/drivers/pci/dwc/pcie-designware-ep.c @@ -214,7 +214,9 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 encode_int) struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); - val = (encode_int << MSI_CAP_MMC_SHIFT); + val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL); + val &= ~MSI_CAP_MMC_MASK; + val |= (encode_int << MSI_CAP_MMC_SHIFT) & MSI_CAP_MMC_MASK; dw_pcie_writew_dbi(pci, MSI_MESSAGE_CONTROL, val); return 0; diff --git a/drivers/pci/dwc/pcie-designware.h b/drivers/pci/dwc/pcie-designware.h index 3551dd607b90..5af29d125c7e 100644 --- a/drivers/pci/dwc/pcie-designware.h +++ b/drivers/pci/dwc/pcie-designware.h @@ -99,6 +99,7 @@ #define MSI_MESSAGE_CONTROL 0x52 #define MSI_CAP_MMC_SHIFT 1 +#define MSI_CAP_MMC_MASK (7 << MSI_CAP_MMC_SHIFT) #define MSI_CAP_MME_SHIFT 4 #define MSI_CAP_MSI_EN_MASK 0x1 #define MSI_CAP_MME_MASK (7 << MSI_CAP_MME_SHIFT) From dd913d610e214b97bba81b1b76883d0f29d63a37 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 20 Dec 2017 00:29:24 +0100 Subject: [PATCH 103/134] PCI: designware-ep: Read-only registers need DBI_RO_WR_EN to be writable commit 1cab826b30c6275d479a6ab1dea1067e15dbec62 upstream. Certain registers that pcie-designware-ep tries to write to are read-only registers. However, these registers can become read/write if we first enable the DBI_RO_WR_EN bit. Set/unset the DBI_RO_WR_EN bit before/after writing these registers. Tested-by: Gustavo Pimentel Signed-off-by: Niklas Cassel Signed-off-by: Lorenzo Pieralisi Acked-by: Joao Pinto Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/pci/dwc/pcie-designware-ep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/dwc/pcie-designware-ep.c b/drivers/pci/dwc/pcie-designware-ep.c index df317d390317..abcbf0770358 100644 --- a/drivers/pci/dwc/pcie-designware-ep.c +++ b/drivers/pci/dwc/pcie-designware-ep.c @@ -35,8 +35,10 @@ static void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) u32 reg; reg = PCI_BASE_ADDRESS_0 + (4 * bar); + dw_pcie_dbi_ro_wr_en(pci); dw_pcie_writel_dbi2(pci, reg, 0x0); dw_pcie_writel_dbi(pci, reg, 0x0); + dw_pcie_dbi_ro_wr_dis(pci); } static int dw_pcie_ep_write_header(struct pci_epc *epc, @@ -45,6 +47,7 @@ static int dw_pcie_ep_write_header(struct pci_epc *epc, struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + dw_pcie_dbi_ro_wr_en(pci); dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, hdr->vendorid); dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, hdr->deviceid); dw_pcie_writeb_dbi(pci, PCI_REVISION_ID, hdr->revid); @@ -58,6 +61,7 @@ static int dw_pcie_ep_write_header(struct pci_epc *epc, dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_ID, hdr->subsys_id); dw_pcie_writeb_dbi(pci, PCI_INTERRUPT_PIN, hdr->interrupt_pin); + dw_pcie_dbi_ro_wr_dis(pci); return 0; } @@ -142,8 +146,10 @@ static int dw_pcie_ep_set_bar(struct pci_epc *epc, enum pci_barno bar, if (ret) return ret; + dw_pcie_dbi_ro_wr_en(pci); dw_pcie_writel_dbi2(pci, reg, size - 1); dw_pcie_writel_dbi(pci, reg, flags); + dw_pcie_dbi_ro_wr_dis(pci); return 0; } @@ -217,7 +223,9 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 encode_int) val = dw_pcie_readw_dbi(pci, MSI_MESSAGE_CONTROL); val &= ~MSI_CAP_MMC_MASK; val |= (encode_int << MSI_CAP_MMC_SHIFT) & MSI_CAP_MMC_MASK; + dw_pcie_dbi_ro_wr_en(pci); dw_pcie_writew_dbi(pci, MSI_MESSAGE_CONTROL, val); + dw_pcie_dbi_ro_wr_dis(pci); return 0; } From 421abdcc909322d6595504cf8d20c1adf74bc1cd Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 11 Jan 2018 14:00:57 +0530 Subject: [PATCH 104/134] PCI: endpoint: Use EPC's device in dma_alloc_coherent()/dma_free_coherent() commit b330104fa76df3eae6e199a23791fed5d35f06b4 upstream. After commit 723288836628 ("of: restrict DMA configuration"), of_dma_configure() doesn't configure the coherent_dma_mask/dma_mask of endpoint function device (since it doesn't have a DT node associated with and hence no dma-ranges property), resulting in dma_alloc_coherent() (used in pci_epf_alloc_space()) to fail. Fix it by making dma_alloc_coherent() use EPC's device for allocating memory address. Link: http://lkml.kernel.org/r/64d63468-d28f-8fcd-a6f3-cf2a6401c8cb@ti.com Signed-off-by: Kishon Vijay Abraham I [lorenzo.pieralisi@arm.com: tweaked commit log] Signed-off-by: Lorenzo Pieralisi Cc: Robin Murphy Cc: Rob Herring Cc: Christoph Hellwig Tested-by: Cyrille Pitchen Tested-by: Niklas Cassel Reviewed-by: Robin Murphy Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/pci/endpoint/pci-epc-core.c | 10 ---------- drivers/pci/endpoint/pci-epf-core.c | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index 42c2a1156325..cd7d4788b94d 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -18,7 +18,6 @@ */ #include -#include #include #include #include @@ -371,7 +370,6 @@ EXPORT_SYMBOL_GPL(pci_epc_write_header); int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf) { unsigned long flags; - struct device *dev = epc->dev.parent; if (epf->epc) return -EBUSY; @@ -383,12 +381,6 @@ int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf) return -EINVAL; epf->epc = epc; - if (dev->of_node) { - of_dma_configure(&epf->dev, dev->of_node); - } else { - dma_set_coherent_mask(&epf->dev, epc->dev.coherent_dma_mask); - epf->dev.dma_mask = epc->dev.dma_mask; - } spin_lock_irqsave(&epc->lock, flags); list_add_tail(&epf->list, &epc->pci_epf); @@ -503,9 +495,7 @@ __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, INIT_LIST_HEAD(&epc->pci_epf); device_initialize(&epc->dev); - dma_set_coherent_mask(&epc->dev, dev->coherent_dma_mask); epc->dev.class = pci_epc_class; - epc->dev.dma_mask = dev->dma_mask; epc->dev.parent = dev; epc->ops = ops; diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index ae1611a62808..95ccc4b8a0a2 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(pci_epf_bind); */ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar) { - struct device *dev = &epf->dev; + struct device *dev = epf->epc->dev.parent; if (!addr) return; @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(pci_epf_free_space); void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar) { void *space; - struct device *dev = &epf->dev; + struct device *dev = epf->epc->dev.parent; dma_addr_t phys_addr; if (size < 128) From c492471db905b6c7c673ae5630dd0a27c8cd00b0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 25 Dec 2017 19:10:37 +0800 Subject: [PATCH 105/134] rtc: Fix overflow when converting time64_t to rtc_time commit 36d46cdb43efea74043e29e2a62b13e9aca31452 upstream. If we convert one large time values to rtc_time, in the original formula 'days * 86400' can be overflowed in 'unsigned int' type to make the formula get one incorrect remain seconds value. Thus we can use div_s64_rem() function to avoid this situation. Signed-off-by: Baolin Wang Acked-by: Arnd Bergmann Signed-off-by: Alexandre Belloni Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/rtc/rtc-lib.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index 1ae7da5cfc60..ad5bb21908e5 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -52,13 +52,11 @@ EXPORT_SYMBOL(rtc_year_days); */ void rtc_time64_to_tm(time64_t time, struct rtc_time *tm) { - unsigned int month, year; - unsigned long secs; + unsigned int month, year, secs; int days; /* time must be positive */ - days = div_s64(time, 86400); - secs = time - (unsigned int) days * 86400; + days = div_s64_rem(time, 86400, &secs); /* day of the week, 1970-01-01 was a Thursday */ tm->tm_wday = (days + 4) % 7; From 423f2c97a25392635216d349b85a4c2440de3a45 Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Thu, 29 Mar 2018 15:43:01 +0100 Subject: [PATCH 106/134] sched/cpufreq/schedutil: Fix error path mutex unlock commit 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 upstream. This patch prevents the 'global_tunables_lock' mutex from being unlocked before being locked. This mutex is not locked if the sugov_kthread_create() function fails. Signed-off-by: Jules Maselbas Acked-by: Peter Zijlstra Cc: Chris Redpath Cc: Dietmar Eggermann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Patrick Bellasi Cc: Stephen Kyle Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: nd@arm.com Link: http://lkml.kernel.org/r/20180329144301.38419-1-jules.maselbas@arm.com Signed-off-by: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 81eb7899c7c8..b314c9eaa71d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -605,10 +605,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: From 9c5f91cc9b2e8d4637182e9b43124af31803abbe Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Wed, 28 Mar 2018 19:03:23 +0200 Subject: [PATCH 107/134] pwm-backlight: Enable/disable the PWM before/after LCD enable toggle. commit 5fb5caee92ba35a4a3baa61d45a78eb057e2c031 upstream. Before this patch the enable signal was set before the PWM signal and vice-versa on power off. This sequence is wrong, at least, it is on the different panels datasheets that I checked, so I inverted the sequence to follow the specs. For reference the following panels have the mentioned sequence: - N133HSE-EA1 (Innolux) - N116BGE (Innolux) - N156BGE-L21 (Innolux) - B101EAN0 (Auo) - B101AW03 (Auo) - LTN101NT05 (Samsung) - CLAA101WA01A (Chunghwa) Signed-off-by: Enric Balletbo i Serra Acked-by: Daniel Thompson Acked-by: Jingoo Han Acked-by: Thierry Reding Signed-off-by: Lee Jones Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/video/backlight/pwm_bl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 0fa7d2bd0e48..155153ecb894 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -54,10 +54,11 @@ static void pwm_backlight_power_on(struct pwm_bl_data *pb, int brightness) if (err < 0) dev_err(pb->dev, "failed to enable power supply\n"); + pwm_enable(pb->pwm); + if (pb->enable_gpio) gpiod_set_value_cansleep(pb->enable_gpio, 1); - pwm_enable(pb->pwm); pb->enabled = true; } @@ -66,12 +67,12 @@ static void pwm_backlight_power_off(struct pwm_bl_data *pb) if (!pb->enabled) return; - pwm_config(pb->pwm, 0, pb->period); - pwm_disable(pb->pwm); - if (pb->enable_gpio) gpiod_set_value_cansleep(pb->enable_gpio, 0); + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + regulator_disable(pb->power_supply); pb->enabled = false; } From 69ef9ca4677e0e83ef8dc08f4eec8c466d218ba9 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 16 Nov 2018 19:01:10 +0800 Subject: [PATCH 108/134] power: supply: charger-manager: Fix incorrect return value commit f25a646fbe2051527ad9721853e892d13a99199e upstream. Fix incorrect return value. Signed-off-by: Baolin Wang Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/supply/charger-manager.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c index 6502fa7c2106..f60dfc213257 100644 --- a/drivers/power/supply/charger-manager.c +++ b/drivers/power/supply/charger-manager.c @@ -1212,7 +1212,6 @@ static int charger_extcon_init(struct charger_manager *cm, if (ret < 0) { pr_info("Cannot register extcon_dev for %s(cable: %s)\n", cable->extcon_name, cable->name); - ret = -EINVAL; } return ret; @@ -1629,7 +1628,7 @@ static int charger_manager_probe(struct platform_device *pdev) if (IS_ERR(desc)) { dev_err(&pdev->dev, "No platform data (desc) found\n"); - return -ENODEV; + return PTR_ERR(desc); } cm = devm_kzalloc(&pdev->dev, sizeof(*cm), GFP_KERNEL); From 0cc17a7a320324a84f7b4731841b0ec10e65214e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 29 Mar 2018 00:06:10 +0200 Subject: [PATCH 109/134] ath10k: avoid possible string overflow commit 6707ba0105a2d350710bc0a537a98f49eb4b895d upstream. The way that 'strncat' is used here raised a warning in gcc-8: drivers/net/wireless/ath/ath10k/wmi.c: In function 'ath10k_wmi_tpc_stats_final_disp_tables': drivers/net/wireless/ath/ath10k/wmi.c:4649:4: error: 'strncat' output truncated before terminating nul copying as many bytes from a string as its length [-Werror=stringop-truncation] Effectively, this is simply a strcat() but the use of strncat() suggests some form of overflow check. Regardless of whether this might actually overflow, using strlcat() instead of strncat() avoids the warning and makes the code more robust. Fixes: bc64d05220f3 ("ath10k: debugfs support to get final TPC stats for 10.4 variants") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 8cb47858eb00..ab8eb9cdfda0 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -4309,7 +4309,7 @@ static void ath10k_tpc_config_disp_tables(struct ath10k *ar, rate_code[i], type); snprintf(buff, sizeof(buff), "%8d ", tpc[j]); - strncat(tpc_value, buff, strlen(buff)); + strlcat(tpc_value, buff, sizeof(tpc_value)); } tpc_stats->tpc_table[type].pream_idx[i] = pream_idx; tpc_stats->tpc_table[type].rate_code[i] = rate_code[i]; From 1848c32fad1666bdc04d40f857284ffcb55f694a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Mar 2019 14:13:56 +0900 Subject: [PATCH 110/134] Linux 4.14.109 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 170411b62525..e02bced59a57 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 108 +SUBLEVEL = 109 EXTRAVERSION = NAME = Petit Gorille From c1a289854bf0406890f4f80911c88acf4d4f2bb1 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 19 Mar 2019 09:53:01 -0700 Subject: [PATCH 111/134] FROMGIT: binder: fix BUG_ON found by selinux-testsuite The selinux-testsuite found an issue resulting in a BUG_ON() where a conditional relied on a size_t going negative when checking the validity of a buffer offset. (cherry picked from commit 5997da82145bb7c9a56d834894cb81f81f219344 git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc-linus) Bug: 67668716 Bug: 129336614 Change-Id: Ib3b408717141deadddcb6b95ad98c0b97d9d98ea Fixes: 7a67a39320df ("binder: add function to copy binder object from buffer") Reported-by: Paul Moore Tested-by: Paul Moore Signed-off-by: Todd Kjos --- drivers/android/binder.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 94e2872cff03..36ad7483424c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2239,7 +2239,8 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (read_size < sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) + if (offset > buffer->data_size || read_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) return 0; binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, offset, read_size); From e7d41267ad9a204ee71fe04c4a792790b7524cc2 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 27 Mar 2019 16:12:31 -0700 Subject: [PATCH 112/134] ANDROID: binder: remove extra declaration left after backport When backporting commit 1a7c3d9bb7a9 ("binder: create userspace-to-binder-buffer copy function"), an extra "int target_fd;" was left in the code. This resulted in the possibility of accessing an uninitialized variable which was flagged by gcc. Bug: 67668716 Change-Id: I787ed89579e9d40e8530d79be67cc663ec755e54 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 36ad7483424c..64824dc8fc22 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2781,7 +2781,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, } for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; - int target_fd; + binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_alloc_copy_from_buffer(&target_proc->alloc, From 35c32388f7db63540b2a84dfe51a824051371e4b Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 23 Nov 2017 10:55:24 +1100 Subject: [PATCH 113/134] UPSTREAM: docs: correct documentation for %pK Current documentation indicates that %pK prints a leading '0x'. This is not the case. Correct documentation for printk specifier %pK. Signed-off-by: Tobin C. Harding (cherry picked from commit 553d8e8b107159088cc4e2855a2bd9a358365e3f) Signed-off-by: Sandeep Patil Bug: 78533979 Test: n/a Change-Id: I055e337b876fa3235c65182fbe6b85f17da4295f --- Documentation/printk-formats.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index d1aecf53badb..1858f5e1ccba 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -85,13 +85,12 @@ Examples:: printk("Faulted at %pS\n", (void *)regs->ip); printk(" %s%pB\n", (reliable ? "" : "? "), (void *)*stack); - Kernel Pointers =============== :: - %pK 0x01234567 or 0x0123456789abcdef + %pK 01234567 or 0123456789abcdef For printing kernel pointers which should be hidden from unprivileged users. The behaviour of ``%pK`` depends on the ``kptr_restrict sysctl`` - see From 3a8bb21cd5320b612e88acb866fc0570b044041a Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 23 Nov 2017 10:56:39 +1100 Subject: [PATCH 114/134] UPSTREAM: vsprintf: refactor %pK code out of pointer() Currently code to handle %pK is all within the switch statement in pointer(). This is the wrong level of abstraction. Each of the other switch clauses call a helper function, pK should do the same. Refactor code out of pointer() to new function restricted_pointer(). Signed-off-by: Tobin C. Harding (cherry picked from commit 57e734423adda83f3b05505875343284efe3b39c) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: Ie78731eb52813a696a33d831b52ae7542ab2a90f --- lib/vsprintf.c | 97 ++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 43 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 4a990f3fd345..811ef125ae2c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1343,6 +1343,59 @@ char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } +int kptr_restrict __read_mostly; + +static noinline_for_stack +char *restricted_pointer(char *buf, char *end, const void *ptr, + struct printf_spec spec) +{ + spec.base = 16; + spec.flags |= SMALL; + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(ptr); + spec.flags |= ZEROPAD; + } + + switch (kptr_restrict) { + case 0: + /* Always print %pK values */ + break; + case 1: { + const struct cred *cred; + + /* + * kptr_restrict==1 cannot be used in IRQ context + * because its test for CAP_SYSLOG would be meaningless. + */ + if (in_irq() || in_serving_softirq() || in_nmi()) + return string(buf, end, "pK-error", spec); + + /* + * Only print the real pointer value if the current + * process has CAP_SYSLOG and is running with the + * same credentials it started with. This is because + * access to files is checked at open() time, but %pK + * checks permission at read() time. We don't want to + * leak pointer values if a binary opens a file using + * %pK and then elevates privileges before reading it. + */ + cred = current_cred(); + if (!has_capability_noaudit(current, CAP_SYSLOG) || + !uid_eq(cred->euid, cred->uid) || + !gid_eq(cred->egid, cred->gid)) + ptr = NULL; + break; + } + case 2: + default: + /* Always print 0's for %pK */ + ptr = NULL; + break; + } + + return number(buf, end, (unsigned long)ptr, spec); +} + static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt) { @@ -1588,8 +1641,6 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, return widen_string(buf, buf - buf_start, end, spec); } -int kptr_restrict __read_mostly; - /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -1789,47 +1840,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return buf; } case 'K': - switch (kptr_restrict) { - case 0: - /* Always print %pK values */ - break; - case 1: { - const struct cred *cred; - - /* - * kptr_restrict==1 cannot be used in IRQ context - * because its test for CAP_SYSLOG would be meaningless. - */ - if (in_irq() || in_serving_softirq() || in_nmi()) { - if (spec.field_width == -1) - spec.field_width = default_width; - return string(buf, end, "pK-error", spec); - } - - /* - * Only print the real pointer value if the current - * process has CAP_SYSLOG and is running with the - * same credentials it started with. This is because - * access to files is checked at open() time, but %pK - * checks permission at read() time. We don't want to - * leak pointer values if a binary opens a file using - * %pK and then elevates privileges before reading it. - */ - cred = current_cred(); - if (!has_capability_noaudit(current, CAP_SYSLOG) || - !uid_eq(cred->euid, cred->uid) || - !gid_eq(cred->egid, cred->gid)) - ptr = NULL; - break; - } - case 2: - default: - /* Always print 0's for %pK */ - ptr = NULL; - break; - } - break; - + return restricted_pointer(buf, end, ptr, spec); case 'N': return netdev_bits(buf, end, ptr, fmt); case 'a': From d3ad09e4f77a4b35a60b7bc696379ee42ef3f442 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 1 Nov 2017 15:32:23 +1100 Subject: [PATCH 115/134] UPSTREAM: printk: hash addresses printed with %p Currently there exist approximately 14 000 places in the kernel where addresses are being printed using an unadorned %p. This potentially leaks sensitive information regarding the Kernel layout in memory. Many of these calls are stale, instead of fixing every call lets hash the address by default before printing. This will of course break some users, forcing code printing needed addresses to be updated. Code that _really_ needs the address will soon be able to use the new printk specifier %px to print the address. For what it's worth, usage of unadorned %p can be broken down as follows (thanks to Joe Perches). $ git grep -E '%p[^A-Za-z0-9]' | cut -f1 -d"/" | sort | uniq -c 1084 arch 20 block 10 crypto 32 Documentation 8121 drivers 1221 fs 143 include 101 kernel 69 lib 100 mm 1510 net 40 samples 7 scripts 11 security 166 sound 152 tools 2 virt Add function ptr_to_id() to map an address to a 32 bit unique identifier. Hash any unadorned usage of specifier %p and any malformed specifiers. Signed-off-by: Tobin C. Harding (cherry picked from commit ad67b74d2469d9b82aaa572d76474c95bc484d57) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Test: CONFIG_TEST_PRINTF Change-Id: I9aa3dea0d40a13f5c858ad2253a8a712195ab1d5 --- Documentation/printk-formats.txt | 12 +++- lib/test_printf.c | 108 ++++++++++++++++++++----------- lib/vsprintf.c | 81 +++++++++++++++++++++-- 3 files changed, 155 insertions(+), 46 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 1858f5e1ccba..65e69d860d52 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -5,7 +5,6 @@ How to get printk format specifiers right :Author: Randy Dunlap :Author: Andrew Murray - Integer types ============= @@ -45,6 +44,17 @@ return from vsnprintf. Raw pointer value SHOULD be printed with %p. The kernel supports the following extended format specifiers for pointer types: +Pointer Types +============= + +Pointers printed without a specifier extension (i.e unadorned %p) are +hashed to give a unique identifier without leaking kernel addresses to user +space. On 64 bit machines the first 32 bits are zeroed. + +:: + + %p abcdef12 or 00000000abcdef12 + Symbols/Function Pointers ========================= diff --git a/lib/test_printf.c b/lib/test_printf.c index 563f10e6876a..71ebfa43ad05 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -24,24 +24,6 @@ #define PAD_SIZE 16 #define FILL_CHAR '$' -#define PTR1 ((void*)0x01234567) -#define PTR2 ((void*)(long)(int)0xfedcba98) - -#if BITS_PER_LONG == 64 -#define PTR1_ZEROES "000000000" -#define PTR1_SPACES " " -#define PTR1_STR "1234567" -#define PTR2_STR "fffffffffedcba98" -#define PTR_WIDTH 16 -#else -#define PTR1_ZEROES "0" -#define PTR1_SPACES " " -#define PTR1_STR "1234567" -#define PTR2_STR "fedcba98" -#define PTR_WIDTH 8 -#endif -#define PTR_WIDTH_STR stringify(PTR_WIDTH) - static unsigned total_tests __initdata; static unsigned failed_tests __initdata; static char *test_buffer __initdata; @@ -217,30 +199,79 @@ test_string(void) test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); } +#define PLAIN_BUF_SIZE 64 /* leave some space so we don't oops */ + +#if BITS_PER_LONG == 64 + +#define PTR_WIDTH 16 +#define PTR ((void *)0xffff0123456789ab) +#define PTR_STR "ffff0123456789ab" +#define ZEROS "00000000" /* hex 32 zero bits */ + +static int __init +plain_format(void) +{ + char buf[PLAIN_BUF_SIZE]; + int nchars; + + nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", PTR); + + if (nchars != PTR_WIDTH || strncmp(buf, ZEROS, strlen(ZEROS)) != 0) + return -1; + + return 0; +} + +#else + +#define PTR_WIDTH 8 +#define PTR ((void *)0x456789ab) +#define PTR_STR "456789ab" + +static int __init +plain_format(void) +{ + /* Format is implicitly tested for 32 bit machines by plain_hash() */ + return 0; +} + +#endif /* BITS_PER_LONG == 64 */ + +static int __init +plain_hash(void) +{ + char buf[PLAIN_BUF_SIZE]; + int nchars; + + nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", PTR); + + if (nchars != PTR_WIDTH || strncmp(buf, PTR_STR, PTR_WIDTH) == 0) + return -1; + + return 0; +} + +/* + * We can't use test() to test %p because we don't know what output to expect + * after an address is hashed. + */ static void __init plain(void) { - test(PTR1_ZEROES PTR1_STR " " PTR2_STR, "%p %p", PTR1, PTR2); - /* - * The field width is overloaded for some %p extensions to - * pass another piece of information. For plain pointers, the - * behaviour is slightly odd: One cannot pass either the 0 - * flag nor a precision to %p without gcc complaining, and if - * one explicitly gives a field width, the number is no longer - * zero-padded. - */ - test("|" PTR1_STR PTR1_SPACES " | " PTR1_SPACES PTR1_STR "|", - "|%-*p|%*p|", PTR_WIDTH+2, PTR1, PTR_WIDTH+2, PTR1); - test("|" PTR2_STR " | " PTR2_STR "|", - "|%-*p|%*p|", PTR_WIDTH+2, PTR2, PTR_WIDTH+2, PTR2); + int err; - /* - * Unrecognized %p extensions are treated as plain %p, but the - * alphanumeric suffix is ignored (that is, does not occur in - * the output.) - */ - test("|"PTR1_ZEROES PTR1_STR"|", "|%p0y|", PTR1); - test("|"PTR2_STR"|", "|%p0y|", PTR2); + err = plain_hash(); + if (err) { + pr_warn("plain 'p' does not appear to be hashed\n"); + failed_tests++; + return; + } + + err = plain_format(); + if (err) { + pr_warn("hashing plain 'p' has unexpected format\n"); + failed_tests++; + } } static void __init @@ -251,6 +282,7 @@ symbol_ptr(void) static void __init kernel_ptr(void) { + /* We can't test this without access to kptr_restrict. */ } static void __init diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 811ef125ae2c..7c0a8ed8a0f5 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #ifdef CONFIG_BLOCK #include #endif @@ -1641,6 +1643,73 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, return widen_string(buf, buf - buf_start, end, spec); } +static bool have_filled_random_ptr_key __read_mostly; +static siphash_key_t ptr_key __read_mostly; + +static void fill_random_ptr_key(struct random_ready_callback *unused) +{ + get_random_bytes(&ptr_key, sizeof(ptr_key)); + /* + * have_filled_random_ptr_key==true is dependent on get_random_bytes(). + * ptr_to_id() needs to see have_filled_random_ptr_key==true + * after get_random_bytes() returns. + */ + smp_mb(); + WRITE_ONCE(have_filled_random_ptr_key, true); +} + +static struct random_ready_callback random_ready = { + .func = fill_random_ptr_key +}; + +static int __init initialize_ptr_random(void) +{ + int ret = add_random_ready_callback(&random_ready); + + if (!ret) { + return 0; + } else if (ret == -EALREADY) { + fill_random_ptr_key(&random_ready); + return 0; + } + + return ret; +} +early_initcall(initialize_ptr_random); + +/* Maps a pointer to a 32 bit unique identifier. */ +static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec) +{ + unsigned long hashval; + const int default_width = 2 * sizeof(ptr); + + if (unlikely(!have_filled_random_ptr_key)) { + spec.field_width = default_width; + /* string length must be less than default_width */ + return string(buf, end, "(ptrval)", spec); + } + +#ifdef CONFIG_64BIT + hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); + /* + * Mask off the first 32 bits, this makes explicit that we have + * modified the address (and 32 bits is plenty for a unique ID). + */ + hashval = hashval & 0xffffffff; +#else + hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); +#endif + + spec.flags |= SMALL; + if (spec.field_width == -1) { + spec.field_width = default_width; + spec.flags |= ZEROPAD; + } + spec.base = 16; + + return number(buf, end, hashval, spec); +} + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -1751,6 +1820,9 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a * pointer to the real address. + * + * Note: The default behaviour (unadorned %p) is to hash the address, + * rendering it useful as a unique identifier. */ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, @@ -1866,14 +1938,9 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return device_node_string(buf, end, ptr, spec, fmt + 1); } } - spec.flags |= SMALL; - if (spec.field_width == -1) { - spec.field_width = default_width; - spec.flags |= ZEROPAD; - } - spec.base = 16; - return number(buf, end, (unsigned long) ptr, spec); + /* default is to _not_ leak addresses, hash before printing */ + return ptr_to_id(buf, end, ptr, spec); } /* From b27ce7346e29c146fd786bd0ddfd8654218ab342 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 23 Nov 2017 10:59:45 +1100 Subject: [PATCH 116/134] UPSTREAM: vsprintf: add printk specifier %px printk specifier %p now hashes all addresses before printing. Sometimes we need to see the actual unmodified address. This can be achieved using %lx but then we face the risk that if in future we want to change the way the Kernel handles printing of pointers we will have to grep through the already existent 50 000 %lx call sites. Let's add specifier %px as a clear, opt-in, way to print a pointer and maintain some level of isolation from all the other hex integer output within the Kernel. Add printk specifier %px to print the actual unmodified address. Signed-off-by: Tobin C. Harding (cherry picked from commit 7b1924a1d930eb27fc79c4e4e2a6c1c970623e68) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: I3fe64ef81cfb62d49822511cf8087e17abc6da37 --- Documentation/printk-formats.txt | 18 +++++++++++++++++- lib/vsprintf.c | 18 ++++++++++++++++++ scripts/checkpatch.pl | 2 +- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 65e69d860d52..bbfeeb0813d3 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -49,7 +49,8 @@ Pointer Types Pointers printed without a specifier extension (i.e unadorned %p) are hashed to give a unique identifier without leaking kernel addresses to user -space. On 64 bit machines the first 32 bits are zeroed. +space. On 64 bit machines the first 32 bits are zeroed. If you _really_ +want the address see %px below. :: @@ -106,6 +107,21 @@ For printing kernel pointers which should be hidden from unprivileged users. The behaviour of ``%pK`` depends on the ``kptr_restrict sysctl`` - see Documentation/sysctl/kernel.txt for more details. +Unmodified Addresses +==================== + +:: + + %px 01234567 or 0123456789abcdef + +For printing pointers when you _really_ want to print the address. Please +consider whether or not you are leaking sensitive information about the +Kernel layout in memory before printing pointers with %px. %px is +functionally equivalent to %lx. %px is preferred to %lx because it is more +uniquely grep'able. If, in the future, we need to modify the way the Kernel +handles printing pointers it will be nice to be able to find the call +sites. + Struct Resources ================ diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7c0a8ed8a0f5..ac1f232152f0 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1643,6 +1643,20 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, return widen_string(buf, buf - buf_start, end, spec); } +static noinline_for_stack +char *pointer_string(char *buf, char *end, const void *ptr, + struct printf_spec spec) +{ + spec.base = 16; + spec.flags |= SMALL; + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(ptr); + spec.flags |= ZEROPAD; + } + + return number(buf, end, (unsigned long int)ptr, spec); +} + static bool have_filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; @@ -1815,6 +1829,8 @@ static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec spec) * c major compatible string * C full compatible string * + * - 'x' For printing the address. Equivalent to "%lx". + * * ** Please update also Documentation/printk-formats.txt when making changes ** * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 @@ -1937,6 +1953,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, case 'F': return device_node_string(buf, end, ptr, spec, fmt + 1); } + case 'x': + return pointer_string(buf, end, ptr, spec); } /* default is to _not_ leak addresses, hash before printing */ diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8b80bac055e4..bfc5dfcbbc9c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5762,7 +5762,7 @@ sub process { for (my $count = $linenr; $count <= $lc; $count++) { my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); $fmt =~ s/%%//g; - if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGNO]).)/) { + if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGNOx]).)/) { $bad_extension = $1; last; } From 6ea6d8b10507784c52de020008e8964b1a32f039 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 1 Nov 2017 15:32:22 +1100 Subject: [PATCH 117/134] UPSTREAM: kasan: use %px to print addresses instead of %p Pointers printed with %p are now hashed by default. Kasan needs the actual address. We can use the new printk specifier %px for this purpose. Use %px instead of %p to print addresses. Signed-off-by: Tobin C. Harding (cherry picked from commit 6424f6bb432752c7eb90cbeeb1c31d6125bba39a) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: I8059d086c2cb2ec8ca01520acb2227335330e067 --- mm/kasan/report.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6bcfb01ba038..410c8235e671 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -134,7 +134,7 @@ static void print_error_description(struct kasan_access_info *info) pr_err("BUG: KASAN: %s in %pS\n", bug_type, (void *)info->ip); - pr_err("%s of size %zu at addr %p by task %s/%d\n", + pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, info->access_addr, current->comm, task_pid_nr(current)); } @@ -206,7 +206,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, const char *rel_type; int rel_bytes; - pr_err("The buggy address belongs to the object at %p\n" + pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); @@ -225,7 +225,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, } pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%p, %p)\n", + " %d-byte region [%px, %px)\n", rel_bytes, rel_type, cache->object_size, (void *)object_addr, (void *)(object_addr + cache->object_size)); } @@ -302,7 +302,7 @@ static void print_shadow_for_address(const void *addr) char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), - (i == 0) ? ">%p: " : " %p: ", kaddr); + (i == 0) ? ">%px: " : " %px: ", kaddr); /* * We should not pass a shadow pointer to generic * function, because generic functions may try to From 9916e5987e606e740b00589a7012b349ab50ddd8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Dec 2017 15:32:58 -0800 Subject: [PATCH 118/134] UPSTREAM: mm/slab.c: do not hash pointers when debugging slab If CONFIG_DEBUG_SLAB/CONFIG_DEBUG_SLAB_LEAK are enabled, the slab code prints extra debug information when e.g. corruption is detected. This includes pointers, which are not very useful when hashed. Fix this by using %px to print unhashed pointers instead where it makes sense, and by removing the printing of a last user pointer referring to code. [geert+renesas@glider.be: v2] Link: http://lkml.kernel.org/r/1513179267-2509-1-git-send-email-geert+renesas@glider.be Link: http://lkml.kernel.org/r/1512641861-5113-1-git-send-email-geert+renesas@glider.be Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Geert Uytterhoeven Acked-by: Christoph Lameter Acked-by: Linus Torvalds Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Tobin C . Harding" Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 85c3e4a5a185f22649c6bf33bdce7bb1ac890921) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: I721a4ca47adfbb70b3fa1859a66a3c5fbca0dd00 --- mm/slab.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 09df506ae830..f748728bf329 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1589,11 +1589,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) *dbg_redzone2(cachep, objp)); } - if (cachep->flags & SLAB_STORE_USER) { - pr_err("Last user: [<%p>](%pSR)\n", - *dbg_userword(cachep, objp), - *dbg_userword(cachep, objp)); - } + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { @@ -1626,7 +1623,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); @@ -1655,13 +1652,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%p, len=%d\n", realobj, size); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%p, len=%d\n", realobj, size); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -2612,7 +2609,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } @@ -2776,7 +2773,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", obj, redzone1, redzone2); } @@ -3082,7 +3079,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3095,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -4293,7 +4290,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) return; } #endif - seq_printf(m, "%p", (void *)address); + seq_printf(m, "%px", (void *)address); } static int leaks_show(struct seq_file *m, void *p) From 47e67b43c29595393ce8e6ba8c5e73558c19bc57 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 19 Dec 2017 13:52:23 -0800 Subject: [PATCH 119/134] UPSTREAM: Do not hash userspace addresses in fault handlers The hashing of %p was designed to restrict kernel addresses. There is no reason to hash the userspace values seen during a segfault report, so switch these to %px. (Some architectures already use %lx.) Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds (cherry picked from commit 10a7e9d849150a2879efc0b04d8a51068c9dd0c5) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: Ibd04bf839ec7c86884ad2366324bf9cd69d05f34 --- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index be3136f142a9..a8103a84b4ac 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->pc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 815c03d7a765..41363f46797b 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->tpc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 4e6fcb32620f..428644175956 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs) if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), (void *)UPT_IP(regs), (void *)UPT_SP(regs), diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794c35c4ca73..99a141d500dc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); From aefbab3ee1c5dcaa1536bc478b63c96a720b6436 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 2 Jan 2018 12:15:53 -0800 Subject: [PATCH 120/134] UPSTREAM: usercopy: Remove pointer from overflow report Using %p was already mostly useless in the usercopy overflow reports, so this removes it entirely to avoid confusion now that %p-hashing is enabled. Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Kees Cook (cherry picked from commit 4f5e838605c264fcf16c3ff9495bd83da99acc6a) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: I6818c6a678844f174f4ac7ca08b945ec7ce229d5 --- mm/usercopy.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index a9852b24715d..5df1e68d4585 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -58,12 +58,11 @@ static noinline int check_stack_object(const void *obj, unsigned long len) return GOOD_STACK; } -static void report_usercopy(const void *ptr, unsigned long len, - bool to_user, const char *type) +static void report_usercopy(unsigned long len, bool to_user, const char *type) { - pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + pr_emerg("kernel memory %s attempt detected %s '%s' (%lu bytes)\n", to_user ? "exposure" : "overwrite", - to_user ? "from" : "to", ptr, type ? : "unknown", len); + to_user ? "from" : "to", type ? : "unknown", len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch @@ -261,6 +260,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) return; report: - report_usercopy(ptr, n, to_user, err); + report_usercopy(n, to_user, err); } EXPORT_SYMBOL(__check_object_size); From 3d3f9956c2ba0da4470e2554989dd51780704086 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 6 Jan 2018 11:12:46 +0530 Subject: [PATCH 121/134] UPSTREAM: trace_uprobe: Display correct offset in uprobe_events Recently, how the pointers being printed with %p has been changed by commit ad67b74d2469 ("printk: hash addresses printed with %p"). This is causing a regression while showing offset in the uprobe_events file. Instead of %p, use %px to display offset. Before patch: # perf probe -vv -x /tmp/a.out main Opening /sys/kernel/debug/tracing//uprobe_events write=1 Writing event: p:probe_a/main /tmp/a.out:0x58c # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x0000000049a0f352 After patch: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x000000000000058c Link: http://lkml.kernel.org/r/20180106054246.15375-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) (cherry picked from commit 0e4d819d0893dc043ea7b7cb6baf4be1e310bd96) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: Iede3934b79b35063691f79abaf2d2b45e4a68cf8 --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fdf2ea4d64ec..10d13fdcabf4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -611,7 +611,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%p", (void *)tu->offset); + seq_printf(m, "0x%px", (void *)tu->offset); } else { switch (sizeof(void *)) { case 4: From ef5c8849337f8fef1a46bcb0f2bdc8c0cde28482 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:36 +0100 Subject: [PATCH 122/134] UPSTREAM: x86/alternative: Print unadorned pointers After commit ad67b74d2469 ("printk: hash addresses printed with %p") pointers are being hashed when printed. However, this makes the alternative debug output completely useless. Switch to %px in order to see the unadorned kernel pointers. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-2-bp@alien8.de (cherry picked from commit 0e6c16c652cadaffd25a6bb326ec10da5bcec6b4) Signed-off-by: Sandeep Patil Bug: 78533979 Test: Build and boot cuttlefish Change-Id: Ifd8e85564bce6530201ee1a8ea032a8ac2e65aee --- arch/x86/kernel/alternative.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b034826a0b3b..21be0193d9dc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -287,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) tgt_rip = next_rip + o_dspl; n_dspl = tgt_rip - orig_insn; - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); if (tgt_rip - orig_insn >= 0) { if (n_dspl - 2 <= 127) @@ -344,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins add_nops(instr + (a->instrlen - a->padlen), a->padlen); local_irq_restore(flags); - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", instr, a->instrlen - a->padlen, a->padlen); } @@ -365,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("alt table %p -> %p", start, end); + DPRINTK("alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -389,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, continue; } - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", a->cpuid >> 5, a->cpuid & 0x1f, instr, a->instrlen, replacement, a->replacementlen, a->padlen); - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; @@ -422,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, a->instrlen - a->replacementlen); insnbuf_sz += a->instrlen - a->replacementlen; } - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); text_poke_early(instr, insnbuf, insnbuf_sz); } From e91d6a2aff1724d3f3a840b573f6a2a0ea290539 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 4 Jan 2018 16:17:59 -0800 Subject: [PATCH 123/134] BACKPORT: mm/debug.c: provide useful debugging information for VM_BUG With the recent addition of hashed kernel pointers, places which need to produce useful debug output have to specify %px, not %p. This patch fixes all the VM debug to use %px. This is appropriate because it's debug output that the user should never be able to trigger, and kernel developers need to see the actual pointers. Link: http://lkml.kernel.org/r/20171219133236.GE13680@bombadil.infradead.org Signed-off-by: Matthew Wilcox Acked-by: Michal Hocko Cc: "Tobin C. Harding" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 152a2d199e1385c6ccef17c24555103b30447c91) Signed-off-by: Sandeep Patil Conflicts: mm/debug.c Bug: 124090075 Test: Build and boot cuttlefish Change-Id: I280ec3e95c64904576d65ab3b8b90330da54416b --- mm/debug.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index c55abc893fdc..97609290dd51 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason) */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", + pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) @@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason) #ifdef CONFIG_MEMCG if (page->mem_cgroup) - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); + pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } @@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %p start %p end %p\n" - "next %p prev %p mm %p\n" - "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n" + pr_emerg("vma %px start %px end %px\n" + "next %px prev %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, @@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %p mmap %p seqnum %llu task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU - "get_unmapped_area %p\n" + "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %p flags %lx core_state %p\n" + "binfmt %px flags %lx core_state %px\n" #ifdef CONFIG_AIO - "ioctx_table %p\n" + "ioctx_table %px\n" #endif #ifdef CONFIG_MEMCG - "owner %p " + "owner %px " #endif - "exe_file %p\n" + "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %p\n" + "mmu_notifier_mm %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" From 317336de579f9ffd608709728732b064e78df403 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 28 Mar 2019 17:52:21 -0700 Subject: [PATCH 124/134] ANDROID: Remove Android paranoid check for socket creation For 4.14+ kernels, eBPF cgroup socket filter is used to control socket creation on devices. Remove this check since it is no longer useful. Signed-off-by: Chenbo Feng Bug: 128944261 Test: CtsNetTestCasesInternetPermission Change-Id: I2f353663389fc0f992e5a1b424c12215a2b074b0 --- net/ipv4/af_inet.c | 17 ----------------- net/ipv6/af_inet6.c | 17 ----------------- 2 files changed, 34 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8d39fc3c083f..be2ba4da2197 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -121,20 +121,6 @@ #endif #include -#ifdef CONFIG_ANDROID_PARANOID_NETWORK -#include - -static inline int current_has_network(void) -{ - return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); -} -#else -static inline int current_has_network(void) -{ - return 1; -} -#endif - /* The inetsw table contains everything that inet_create needs to * build a new socket. */ @@ -268,9 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; - if (!current_has_network()) - return -EACCES; - sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 70b392d2bb07..fda5fae57b83 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -66,20 +66,6 @@ #include #include -#ifdef CONFIG_ANDROID_PARANOID_NETWORK -#include - -static inline int current_has_network(void) -{ - return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); -} -#else -static inline int current_has_network(void) -{ - return 1; -} -#endif - #include "ip6_offload.h" MODULE_AUTHOR("Cast of dozens"); @@ -136,9 +122,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; - if (!current_has_network()) - return -EACCES; - /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; From d5e44236b2ecc206b2f61caec8d2849c1b2d7ad9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Mar 2019 11:44:18 -0700 Subject: [PATCH 125/134] UPSTREAM: filemap: pass vm_fault to the mmap ra helpers All of the arguments to these functions come from the vmf. Cut down on the amount of arguments passed by simply passing in the vmf to these two helpers. Bug: 124328118 Change-Id: I5029f277431aaa50ee58df185fa13d7c58b3d774 Link: http://lkml.kernel.org/r/20181211173801.29535-3-josef@toxicpanda.com Signed-off-by: Josef Bacik Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Johannes Weiner Cc: Rik van Riel Cc: Tejun Heo Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 2a1180f1bd389e9d47693e5eb384b95f482d8d19) Signed-off-by: Minchan Kim --- mm/filemap.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index a51b4979374d..52122dd90dbc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2321,20 +2321,20 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) * Synchronous readahead happens when we don't even find * a page in the page cache at all. */ -static void do_sync_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - pgoff_t offset) +static void do_sync_mmap_readahead(struct vm_fault *vmf) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) + if (vmf->vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (vma->vm_flags & VM_SEQ_READ) { + if (vmf->vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -2364,16 +2364,16 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further.. */ -static void do_async_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - struct page *page, - pgoff_t offset) +static void do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) + if (vmf->vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; @@ -2430,10 +2430,10 @@ int filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf->vma, ra, file, page, offset); + do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf->vma, ra, file, offset); + do_sync_mmap_readahead(vmf); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; From 1382500f2ef69859cd115627a435add87f301935 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Mar 2019 11:44:14 -0700 Subject: [PATCH 126/134] BACKPORT: filemap: kill page_cache_read usage in filemap_fault Patch series "drop the mmap_sem when doing IO in the fault path", v6. Now that we have proper isolation in place with cgroups2 we have started going through and fixing the various priority inversions. Most are all gone now, but this one is sort of weird since it's not necessarily a priority inversion that happens within the kernel, but rather because of something userspace does. We have giant applications that we want to protect, and parts of these giant applications do things like watch the system state to determine how healthy the box is for load balancing and such. This involves running 'ps' or other such utilities. These utilities will often walk /proc//whatever, and these files can sometimes need to down_read(&task->mmap_sem). Not usually a big deal, but we noticed when we are stress testing that sometimes our protected application has latency spikes trying to get the mmap_sem for tasks that are in lower priority cgroups. This is because any down_write() on a semaphore essentially turns it into a mutex, so even if we currently have it held for reading, any new readers will not be allowed on to keep from starving the writer. This is fine, except a lower priority task could be stuck doing IO because it has been throttled to the point that its IO is taking much longer than normal. But because a higher priority group depends on this completing it is now stuck behind lower priority work. In order to avoid this particular priority inversion we want to use the existing retry mechanism to stop from holding the mmap_sem at all if we are going to do IO. This already exists in the read case sort of, but needed to be extended for more than just grabbing the page lock. With io.latency we throttle at submit_bio() time, so the readahead stuff can block and even page_cache_read can block, so all these paths need to have the mmap_sem dropped. The other big thing is ->page_mkwrite. btrfs is particularly shitty here because we have to reserve space for the dirty page, which can be a very expensive operation. We use the same retry method as the read path, and simply cache the page and verify the page is still setup properly the next pass through ->page_mkwrite(). I've tested these patches with xfstests and there are no regressions. This patch (of 3): If we do not have a page at filemap_fault time we'll do this weird forced page_cache_read thing to populate the page, and then drop it again and loop around and find it. This makes for 2 ways we can read a page in filemap_fault, and it's not really needed. Instead add a FGP_FOR_MMAP flag so that pagecache_get_page() will return a unlocked page that's in pagecache. Then use the normal page locking and readpage logic already in filemap_fault. This simplifies the no page in page cache case significantly. Bug: 124328118 Change-Id: I89088ef20a5cf5468e358064fc0bc85fcc6e3fc2 [akpm@linux-foundation.org: fix comment text] [josef@toxicpanda.com: don't unlock null page in FGP_FOR_MMAP case] Link: http://lkml.kernel.org/r/20190312201742.22935-1-josef@toxicpanda.com Link: http://lkml.kernel.org/r/20181211173801.29535-2-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Reviewed-by: Jan Kara Reviewed-by: Andrew Morton Cc: Tejun Heo Cc: Dave Chinner Cc: Rik van Riel Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a75d4c33377277b6034dd1e2663bce444f952c14) Signed-off-by: Minchan Kim --- include/linux/pagemap.h | 1 + mm/filemap.c | 77 +++++++++-------------------------------- 2 files changed, 17 insertions(+), 61 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 467af15187c0..cdfe171c0106 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -256,6 +256,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, #define FGP_WRITE 0x00000008 #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 +#define FGP_FOR_MMAP 0x00000040 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index 52122dd90dbc..8dab03da4f87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1439,7 +1439,10 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased - * refcount. Otherwise, NULL is returned. + * refcount. + * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do + * its own locking dance if the page is already in cache, or unlock the page + * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -1492,7 +1495,7 @@ no_page: if (!page) return NULL; - if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ @@ -1506,6 +1509,13 @@ no_page: if (err == -EEXIST) goto repeat; } + + /* + * add_to_page_cache_lru locks the page, and for mmap we expect + * an unlocked page. + */ + if (page && (fgp_flags & FGP_FOR_MMAP)) + unlock_page(page); } return page; @@ -2282,39 +2292,6 @@ out: EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * @gfp_mask: memory allocation flags - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - */ -static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; - - do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); - if (!page) - return -ENOMEM; - - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ - - put_page(page); - - } while (ret == AOP_TRUNCATED_PAGE); - - return ret; -} - #define MMAP_LOTSAMISS (100) /* @@ -2438,9 +2415,11 @@ int filemap_fault(struct vm_fault *vmf) count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_get_page(mapping, offset); + page = pagecache_get_page(mapping, offset, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); if (!page) - goto no_cached_page; + return VM_FAULT_OOM; } if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { @@ -2477,30 +2456,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, offset, vmf->gfp_mask); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; - page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. From ce3ca1406bce18270c605f475b648f2e24be1e81 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Mar 2019 11:44:22 -0700 Subject: [PATCH 127/134] BACKPORT: filemap: drop the mmap_sem for all blocking operations Currently we only drop the mmap_sem if there is contention on the page lock. The idea is that we issue readahead and then go to lock the page while it is under IO and we want to not hold the mmap_sem during the IO. The problem with this is the assumption that the readahead does anything. In the case that the box is under extreme memory or IO pressure we may end up not reading anything at all for readahead, which means we will end up reading in the page under the mmap_sem. Even if the readahead does something, it could get throttled because of io pressure on the system and the process is in a lower priority cgroup. Holding the mmap_sem while doing IO is problematic because it can cause system-wide priority inversions. Consider some large company that does a lot of web traffic. This large company has load balancing logic in it's core web server, cause some engineer thought this was a brilliant plan. This load balancing logic gets statistics from /proc about the system, which trip over processes mmap_sem for various reasons. Now the web server application is in a protected cgroup, but these other processes may not be, and if they are being throttled while their mmap_sem is held we'll stall, and cause this nice death spiral. Instead rework filemap fault path to drop the mmap sem at any point that we may do IO or block for an extended period of time. This includes while issuing readahead, locking the page, or needing to call ->readpage because readahead did not occur. Then once we have a fully uptodate page we can return with VM_FAULT_RETRY and come back again to find our nicely in-cache page that was gotten outside of the mmap_sem. This patch also adds a new helper for locking the page with the mmap_sem dropped. This doesn't make sense currently as generally speaking if the page is already locked it'll have been read in (unless there was an error) before it was unlocked. However a forthcoming patchset will change this with the ability to abort read-ahead bio's if necessary, making it more likely that we could contend for a page lock and still have a not uptodate page. This allows us to deal with this case by grabbing the lock and issuing the IO without the mmap_sem held, and then returning VM_FAULT_RETRY to come back around. Test: fio mmap ioengine with 8-thread on 4cpu under memory pressure for stability check Test: locking holding latency measure with lockstat for performance check Bug: 124328118 Change-Id: I18368917b23fecc592e8e4a65b0f3f5576218e45 [josef@toxicpanda.com: v6] Link: http://lkml.kernel.org/r/20181212152757.10017-1-josef@toxicpanda.com [kirill@shutemov.name: fix race in filemap_fault()] Link: http://lkml.kernel.org/r/20181228235106.okk3oastsnpxusxs@kshutemo-mobl1 [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20181211173801.29535-4-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Tested-by: syzbot+b437b5a429d680cf2217@syzkaller.appspotmail.com Cc: Dave Chinner Cc: Rik van Riel Cc: Tejun Heo Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 6b4c9f4469819a0c1a38a0a4541337e0f9bf6c11) Signed-off-by: Minchan Kim --- mm/filemap.c | 136 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 8dab03da4f87..8364c77ea8fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2293,28 +2293,92 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) +static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} + +/* + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * @vmf - the vm_fault for this fault. + * @page - the page to lock. + * @fpin - the pointer to the file we may pin (or is already pinned). + * + * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * It differs in that it actually returns the page locked if it returns 1 and 0 + * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * will point to the pinned file and needs to be fput()'ed at a later point. + */ +static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, + struct file **fpin) +{ + if (trylock_page(page)) + return 1; + + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + + *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); + if (vmf->flags & FAULT_FLAG_KILLABLE) { + if (__lock_page_killable(page)) { + /* + * We didn't have the right flags to drop the mmap_sem, + * but all fault_handlers only check for fatal signals + * if we return VM_FAULT_RETRY, so we need to drop the + * mmap_sem here and return 0 if we don't have a fpin. + */ + if (*fpin == NULL) + up_read(&vmf->vma->vm_mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; +} + /* - * Synchronous readahead happens when we don't even find - * a page in the page cache at all. + * Synchronous readahead happens when we don't even find a page in the page + * cache at all. We don't want to perform IO under the mmap sem, so if we have + * to drop the mmap sem we return the file that was pinned in order for us to do + * that. If we didn't pin a file then we return NULL. The file that is + * returned needs to be fput()'ed when we're done with it. */ -static void do_sync_mmap_readahead(struct vm_fault *vmf) +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) - return; + return fpin; if (!ra->ra_pages) - return; + return fpin; if (vmf->vma->vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); - return; + return fpin; } /* Avoid banging the cache line if not needed */ @@ -2326,37 +2390,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf) * stop bothering with read-ahead. It will only hurt. */ if (ra->mmap_miss > MMAP_LOTSAMISS) - return; + return fpin; /* * mmap read-around */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, offset - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); + return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, - * so we want to possibly extend the readahead further.. + * so we want to possibly extend the readahead further. We return the file that + * was pinned if we have to drop the mmap_sem in order to do IO. */ -static void do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) +static struct file *do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) - return; + return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; - if (PageReadahead(page)) + if (PageReadahead(page)) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); + } + return fpin; } /** @@ -2386,6 +2457,7 @@ int filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2407,25 +2479,26 @@ int filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf, page); + fpin = do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); retry_find: page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) + if (!page) { + if (fpin) + goto out_retry; return VM_FAULT_OOM; + } } - if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { - put_page(page); - return ret | VM_FAULT_RETRY; - } + if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + goto out_retry; /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { @@ -2442,6 +2515,16 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; + /* + * We've made it this far and we had to drop our mmap_sem, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + unlock_page(page); + goto out_retry; + } + /* * Found the page and have a reference on it. * We must recheck i_size under page lock. @@ -2464,12 +2547,15 @@ page_not_uptodate: * and we need to check for errors. */ ClearPageError(page); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } + if (fpin) + goto out_retry; put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -2478,6 +2564,18 @@ page_not_uptodate: /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_sem, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. + */ + if (page) + put_page(page); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); From 403d0082901ced8df24a09383ab45a0476264118 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Mar 2019 11:26:07 -0700 Subject: [PATCH 128/134] UPSTREAM: filemap: add a comment about FAULT_FLAG_RETRY_NOWAIT behavior I thought Josef Bacik's patch to drop the mmap_sem was buggy, because when looking at the error cases, there was one case where we returned VM_FAULT_RETRY without actually dropping the mmap_sem. Josef had to explain to me (using small words) that yes, that's actually what we're supposed to do, and his patch was correct. Which not only convinced me he knew what he was doing and I should stop arguing with him, but also that I should add a comment to the case I was confused about. Bug: 124328118 Change-Id: I968005b3673da3adad843c2660111e9a70f98c26 Patiently-pointed-out-by: Josef Bacik Signed-off-by: Linus Torvalds (cherry picked from commit 8b0f9fa2e02dc95216577c3387b0707c5f60fbaf) Signed-off-by: Minchan Kim --- mm/filemap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 8364c77ea8fe..d290da2f6af0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2331,6 +2331,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, if (trylock_page(page)) return 1; + /* + * NOTE! This will make us return with VM_FAULT_RETRY, but with + * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * is supposed to work. We have way too many special cases.. + */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; From 7dac4a1d34acc1030b2b0404d383d159f59f849f Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Fri, 29 Mar 2019 14:23:43 -0700 Subject: [PATCH 129/134] ANDROID: drop CONFIG_INPUT_KEYCHORD from cuttlefish Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Bug: 129556081 Change-Id: Ie8a2b9977a21022c204a19f1a8d781ea5a23c656 --- arch/x86/configs/x86_64_cuttlefish_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 9c542d0eb2a8..e101b10a4d72 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -284,7 +284,6 @@ CONFIG_TABLET_USB_GTCO=y CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y CONFIG_INPUT_MISC=y -CONFIG_INPUT_KEYCHORD=y CONFIG_INPUT_UINPUT=y CONFIG_INPUT_GPIO=y # CONFIG_SERIO_I8042 is not set From 3909bbc1d6ca12b489787e8b526880e46765ec9a Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 9 May 2018 12:17:36 -0700 Subject: [PATCH 130/134] Revert "ANDROID: input: keychord: Fix for a memory leak in keychord." This reverts commit 29be5c0c2ef587e7958ef16cc8357e8880e92cd9. Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Bug: 64114943 Bug: 64483974 Bug: 129556081 Change-Id: I4191a02aa70f3c4eb517b9a0ec092380b90130b4 --- drivers/input/misc/keychord.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index 791f285b0c13..d679360ded56 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -432,7 +432,6 @@ static int keychord_release(struct inode *inode, struct file *file) if (kdev->registered) input_unregister_handler(&kdev->input_handler); - kfree(kdev->keychords); kfree(kdev); return 0; From 24ff05e74ac760f585ec251826b60dacda7cf4d6 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 9 May 2018 12:19:32 -0700 Subject: [PATCH 131/134] Revert "ANDROID: input: keychord: Fix races in keychord_write." This reverts commit 8253552b9900cf4497e1c7d15a83ddc5995abcd8. Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Bug: 64114943 Bug: 64133562 Bug: 63974334 Bug: 129556081 Change-Id: Ie94621b0adf8b1f8c0d249f74385cc2914b1aec0 --- drivers/input/misc/keychord.c | 61 +---------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index d679360ded56..4e0f7daeada6 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -60,10 +60,6 @@ struct keychord_device { unsigned char head; unsigned char tail; __u16 buff[BUFFER_SIZE]; - /* Bit to serialize writes to this device */ -#define KEYCHORD_BUSY 0x01 - unsigned long flags; - wait_queue_head_t write_waitq; }; static int check_keychord(struct keychord_device *kdev, @@ -176,6 +172,7 @@ static int keychord_connect(struct input_handler *handler, goto err_input_open_device; pr_info("keychord: using input dev %s for fevent\n", dev->name); + return 0; err_input_open_device: @@ -227,41 +224,6 @@ static ssize_t keychord_read(struct file *file, char __user *buffer, return count; } -/* - * serializes writes on a device. can use mutex_lock_interruptible() - * for this particular use case as well - a matter of preference. - */ -static int -keychord_write_lock(struct keychord_device *kdev) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&kdev->lock, flags); - while (kdev->flags & KEYCHORD_BUSY) { - spin_unlock_irqrestore(&kdev->lock, flags); - ret = wait_event_interruptible(kdev->write_waitq, - ((kdev->flags & KEYCHORD_BUSY) == 0)); - if (ret) - return ret; - spin_lock_irqsave(&kdev->lock, flags); - } - kdev->flags |= KEYCHORD_BUSY; - spin_unlock_irqrestore(&kdev->lock, flags); - return 0; -} - -static void -keychord_write_unlock(struct keychord_device *kdev) -{ - unsigned long flags; - - spin_lock_irqsave(&kdev->lock, flags); - kdev->flags &= ~KEYCHORD_BUSY; - spin_unlock_irqrestore(&kdev->lock, flags); - wake_up_interruptible(&kdev->write_waitq); -} - /* * keychord_write is used to configure the driver */ @@ -288,22 +250,6 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, return -EFAULT; } - /* - * Serialize writes to this device to prevent various races. - * 1) writers racing here could do duplicate input_unregister_handler() - * calls, resulting in attempting to unlink a node from a list that - * does not exist. - * 2) writers racing here could do duplicate input_register_handler() calls - * below, resulting in a duplicate insertion of a node into the list. - * 3) a double kfree of keychords can occur (in the event that - * input_register_handler() fails below. - */ - ret = keychord_write_lock(kdev); - if (ret) { - kfree(keychords); - return ret; - } - /* unregister handler before changing configuration */ if (kdev->registered) { input_unregister_handler(&kdev->input_handler); @@ -372,19 +318,15 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, if (ret) { kfree(keychords); kdev->keychords = 0; - keychord_write_unlock(kdev); return ret; } kdev->registered = 1; - keychord_write_unlock(kdev); - return count; err_unlock_return: spin_unlock_irqrestore(&kdev->lock, flags); kfree(keychords); - keychord_write_unlock(kdev); return -EINVAL; } @@ -410,7 +352,6 @@ static int keychord_open(struct inode *inode, struct file *file) spin_lock_init(&kdev->lock); init_waitqueue_head(&kdev->waitq); - init_waitqueue_head(&kdev->write_waitq); kdev->input_handler.event = keychord_event; kdev->input_handler.connect = keychord_connect; From 95f51a3d9cba14f14d3846ccc113cf24b61adf69 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 9 May 2018 12:23:28 -0700 Subject: [PATCH 132/134] Revert "ANDROID: input: keychord: Fix a slab out-of-bounds read." This reverts commit 92fc7f9aa0298cc112b2893e4e0bcf522f2659a8. Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Cc: Amit Pundir Bug: 64114943 Bug: 63962952 Bug: 129556081 Change-Id: I0a652b72b0ee62974c408ffb0987cc2ef9e346c1 --- drivers/input/misc/keychord.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index 4e0f7daeada6..8ed8a2738ea1 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -232,11 +232,9 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, { struct keychord_device *kdev = file->private_data; struct input_keychord *keychords = 0; - struct input_keychord *keychord; + struct input_keychord *keychord, *next, *end; int ret, i, key; unsigned long flags; - size_t resid = count; - size_t key_bytes; if (count < sizeof(struct input_keychord) || count > PAGE_SIZE) return -EINVAL; @@ -267,29 +265,15 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, kdev->head = kdev->tail = 0; keychord = keychords; + end = (struct input_keychord *)((char *)keychord + count); - while (resid > 0) { - /* Is the entire keychord entry header present ? */ - if (resid < sizeof(struct input_keychord)) { - pr_err("keychord: Insufficient bytes present for header %zu\n", - resid); - goto err_unlock_return; - } - resid -= sizeof(struct input_keychord); - if (keychord->count <= 0) { + while (keychord < end) { + next = NEXT_KEYCHORD(keychord); + if (keychord->count <= 0 || next > end) { pr_err("keychord: invalid keycode count %d\n", keychord->count); goto err_unlock_return; } - key_bytes = keychord->count * sizeof(keychord->keycodes[0]); - /* Do we have all the expected keycodes ? */ - if (resid < key_bytes) { - pr_err("keychord: Insufficient bytes present for keycount %zu\n", - resid); - goto err_unlock_return; - } - resid -= key_bytes; - if (keychord->version != KEYCHORD_VERSION) { pr_err("keychord: unsupported version %d\n", keychord->version); @@ -308,7 +292,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, } kdev->keychord_count++; - keychord = NEXT_KEYCHORD(keychord); + keychord = next; } kdev->keychords = keychords; From 8f5899111b4ac4603de00cb36f29284b1f42b01f Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 9 May 2018 12:24:54 -0700 Subject: [PATCH 133/134] Revert "ANDROID: input: keychord: log when keychord triggered" This reverts commit 4017c45e8bc6f34710fd3727c5bc18184b26879c. Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Cc: JP Abgrall Cc: Amit Pundir Bug: 64114943 Bug: 129556081 Change-Id: I6db729ae86ea9d01e2f2266c5572a4fcafcbb325 --- drivers/input/misc/keychord.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index 8ed8a2738ea1..3a2cd9ca9a31 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -126,12 +126,8 @@ static void keychord_event(struct input_handle *handle, unsigned int type, done: spin_unlock_irqrestore(&kdev->lock, flags); - if (got_chord) { - pr_info("keychord: got keychord id %d. Any tasks: %d\n", - keychord->id, - !list_empty_careful(&kdev->waitq.head)); + if (got_chord) wake_up_interruptible(&kdev->waitq); - } } static int keychord_connect(struct input_handler *handler, From 7401173366e67d861d496e4843d387e2c76ee351 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 9 May 2018 12:26:22 -0700 Subject: [PATCH 134/134] Revert "ANDROID: input: keychord: Add keychord driver" This reverts commit 08a0437bf8d9085dd063171b5b37aae89fa42470. Remove keychord driver, replaced in user space by https://android-review.googlesource.com/c/677629. Signed-off-by: Mark Salyzyn Cc: Mike Lockwood Cc: Amit Pundir Bug: 64114943 Bug: 129556081 Change-Id: I6afdb551f273b6d0e25bf4b23cd8b88e39fbe47f --- drivers/input/misc/Kconfig | 11 - drivers/input/misc/Makefile | 1 - drivers/input/misc/keychord.c | 387 ---------------------------------- include/linux/keychord.h | 23 -- include/uapi/linux/keychord.h | 52 ----- 5 files changed, 474 deletions(-) delete mode 100644 drivers/input/misc/keychord.c delete mode 100644 include/linux/keychord.h delete mode 100644 include/uapi/linux/keychord.h diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 4b269b332636..3ad985ed8cd6 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -367,17 +367,6 @@ config INPUT_ATI_REMOTE2 To compile this driver as a module, choose M here: the module will be called ati_remote2. -config INPUT_KEYCHORD - tristate "Key chord input driver support" - help - Say Y here if you want to enable the key chord driver - accessible at /dev/keychord. This driver can be used - for receiving notifications when client specified key - combinations are pressed. - - To compile this driver as a module, choose M here: the - module will be called keychord. - config INPUT_KEYSPAN_REMOTE tristate "Keyspan DMR USB remote control" depends on USB_ARCH_HAS_HCD diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 2ecb6869e4b6..8da2b882c134 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -43,7 +43,6 @@ obj-$(CONFIG_INPUT_HISI_POWERKEY) += hisi_powerkey.o obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o obj-$(CONFIG_INPUT_IXP4XX_BEEPER) += ixp4xx-beeper.o -obj-$(CONFIG_INPUT_KEYCHORD) += keychord.o obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c deleted file mode 100644 index 3a2cd9ca9a31..000000000000 --- a/drivers/input/misc/keychord.c +++ /dev/null @@ -1,387 +0,0 @@ -/* - * drivers/input/misc/keychord.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KEYCHORD_NAME "keychord" -#define BUFFER_SIZE 16 - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("Key chord input driver"); -MODULE_SUPPORTED_DEVICE("keychord"); -MODULE_LICENSE("GPL"); - -#define NEXT_KEYCHORD(kc) ((struct input_keychord *) \ - ((char *)kc + sizeof(struct input_keychord) + \ - kc->count * sizeof(kc->keycodes[0]))) - -struct keychord_device { - struct input_handler input_handler; - int registered; - - /* list of keychords to monitor */ - struct input_keychord *keychords; - int keychord_count; - - /* bitmask of keys contained in our keychords */ - unsigned long keybit[BITS_TO_LONGS(KEY_CNT)]; - /* current state of the keys */ - unsigned long keystate[BITS_TO_LONGS(KEY_CNT)]; - /* number of keys that are currently pressed */ - int key_down; - - /* second input_device_id is needed for null termination */ - struct input_device_id device_ids[2]; - - spinlock_t lock; - wait_queue_head_t waitq; - unsigned char head; - unsigned char tail; - __u16 buff[BUFFER_SIZE]; -}; - -static int check_keychord(struct keychord_device *kdev, - struct input_keychord *keychord) -{ - int i; - - if (keychord->count != kdev->key_down) - return 0; - - for (i = 0; i < keychord->count; i++) { - if (!test_bit(keychord->keycodes[i], kdev->keystate)) - return 0; - } - - /* we have a match */ - return 1; -} - -static void keychord_event(struct input_handle *handle, unsigned int type, - unsigned int code, int value) -{ - struct keychord_device *kdev = handle->private; - struct input_keychord *keychord; - unsigned long flags; - int i, got_chord = 0; - - if (type != EV_KEY || code >= KEY_MAX) - return; - - spin_lock_irqsave(&kdev->lock, flags); - /* do nothing if key state did not change */ - if (!test_bit(code, kdev->keystate) == !value) - goto done; - __change_bit(code, kdev->keystate); - if (value) - kdev->key_down++; - else - kdev->key_down--; - - /* don't notify on key up */ - if (!value) - goto done; - /* ignore this event if it is not one of the keys we are monitoring */ - if (!test_bit(code, kdev->keybit)) - goto done; - - keychord = kdev->keychords; - if (!keychord) - goto done; - - /* check to see if the keyboard state matches any keychords */ - for (i = 0; i < kdev->keychord_count; i++) { - if (check_keychord(kdev, keychord)) { - kdev->buff[kdev->head] = keychord->id; - kdev->head = (kdev->head + 1) % BUFFER_SIZE; - got_chord = 1; - break; - } - /* skip to next keychord */ - keychord = NEXT_KEYCHORD(keychord); - } - -done: - spin_unlock_irqrestore(&kdev->lock, flags); - - if (got_chord) - wake_up_interruptible(&kdev->waitq); -} - -static int keychord_connect(struct input_handler *handler, - struct input_dev *dev, - const struct input_device_id *id) -{ - int i, ret; - struct input_handle *handle; - struct keychord_device *kdev = - container_of(handler, struct keychord_device, input_handler); - - /* - * ignore this input device if it does not contain any keycodes - * that we are monitoring - */ - for (i = 0; i < KEY_MAX; i++) { - if (test_bit(i, kdev->keybit) && test_bit(i, dev->keybit)) - break; - } - if (i == KEY_MAX) - return -ENODEV; - - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->dev = dev; - handle->handler = handler; - handle->name = KEYCHORD_NAME; - handle->private = kdev; - - ret = input_register_handle(handle); - if (ret) - goto err_input_register_handle; - - ret = input_open_device(handle); - if (ret) - goto err_input_open_device; - - pr_info("keychord: using input dev %s for fevent\n", dev->name); - - return 0; - -err_input_open_device: - input_unregister_handle(handle); -err_input_register_handle: - kfree(handle); - return ret; -} - -static void keychord_disconnect(struct input_handle *handle) -{ - input_close_device(handle); - input_unregister_handle(handle); - kfree(handle); -} - -/* - * keychord_read is used to read keychord events from the driver - */ -static ssize_t keychord_read(struct file *file, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct keychord_device *kdev = file->private_data; - __u16 id; - int retval; - unsigned long flags; - - if (count < sizeof(id)) - return -EINVAL; - count = sizeof(id); - - if (kdev->head == kdev->tail && (file->f_flags & O_NONBLOCK)) - return -EAGAIN; - - retval = wait_event_interruptible(kdev->waitq, - kdev->head != kdev->tail); - if (retval) - return retval; - - spin_lock_irqsave(&kdev->lock, flags); - /* pop a keychord ID off the queue */ - id = kdev->buff[kdev->tail]; - kdev->tail = (kdev->tail + 1) % BUFFER_SIZE; - spin_unlock_irqrestore(&kdev->lock, flags); - - if (copy_to_user(buffer, &id, count)) - return -EFAULT; - - return count; -} - -/* - * keychord_write is used to configure the driver - */ -static ssize_t keychord_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct keychord_device *kdev = file->private_data; - struct input_keychord *keychords = 0; - struct input_keychord *keychord, *next, *end; - int ret, i, key; - unsigned long flags; - - if (count < sizeof(struct input_keychord) || count > PAGE_SIZE) - return -EINVAL; - keychords = kzalloc(count, GFP_KERNEL); - if (!keychords) - return -ENOMEM; - - /* read list of keychords from userspace */ - if (copy_from_user(keychords, buffer, count)) { - kfree(keychords); - return -EFAULT; - } - - /* unregister handler before changing configuration */ - if (kdev->registered) { - input_unregister_handler(&kdev->input_handler); - kdev->registered = 0; - } - - spin_lock_irqsave(&kdev->lock, flags); - /* clear any existing configuration */ - kfree(kdev->keychords); - kdev->keychords = 0; - kdev->keychord_count = 0; - kdev->key_down = 0; - memset(kdev->keybit, 0, sizeof(kdev->keybit)); - memset(kdev->keystate, 0, sizeof(kdev->keystate)); - kdev->head = kdev->tail = 0; - - keychord = keychords; - end = (struct input_keychord *)((char *)keychord + count); - - while (keychord < end) { - next = NEXT_KEYCHORD(keychord); - if (keychord->count <= 0 || next > end) { - pr_err("keychord: invalid keycode count %d\n", - keychord->count); - goto err_unlock_return; - } - if (keychord->version != KEYCHORD_VERSION) { - pr_err("keychord: unsupported version %d\n", - keychord->version); - goto err_unlock_return; - } - - /* keep track of the keys we are monitoring in keybit */ - for (i = 0; i < keychord->count; i++) { - key = keychord->keycodes[i]; - if (key < 0 || key >= KEY_CNT) { - pr_err("keychord: keycode %d out of range\n", - key); - goto err_unlock_return; - } - __set_bit(key, kdev->keybit); - } - - kdev->keychord_count++; - keychord = next; - } - - kdev->keychords = keychords; - spin_unlock_irqrestore(&kdev->lock, flags); - - ret = input_register_handler(&kdev->input_handler); - if (ret) { - kfree(keychords); - kdev->keychords = 0; - return ret; - } - kdev->registered = 1; - - return count; - -err_unlock_return: - spin_unlock_irqrestore(&kdev->lock, flags); - kfree(keychords); - return -EINVAL; -} - -static unsigned int keychord_poll(struct file *file, poll_table *wait) -{ - struct keychord_device *kdev = file->private_data; - - poll_wait(file, &kdev->waitq, wait); - - if (kdev->head != kdev->tail) - return POLLIN | POLLRDNORM; - - return 0; -} - -static int keychord_open(struct inode *inode, struct file *file) -{ - struct keychord_device *kdev; - - kdev = kzalloc(sizeof(struct keychord_device), GFP_KERNEL); - if (!kdev) - return -ENOMEM; - - spin_lock_init(&kdev->lock); - init_waitqueue_head(&kdev->waitq); - - kdev->input_handler.event = keychord_event; - kdev->input_handler.connect = keychord_connect; - kdev->input_handler.disconnect = keychord_disconnect; - kdev->input_handler.name = KEYCHORD_NAME; - kdev->input_handler.id_table = kdev->device_ids; - - kdev->device_ids[0].flags = INPUT_DEVICE_ID_MATCH_EVBIT; - __set_bit(EV_KEY, kdev->device_ids[0].evbit); - - file->private_data = kdev; - - return 0; -} - -static int keychord_release(struct inode *inode, struct file *file) -{ - struct keychord_device *kdev = file->private_data; - - if (kdev->registered) - input_unregister_handler(&kdev->input_handler); - kfree(kdev); - - return 0; -} - -static const struct file_operations keychord_fops = { - .owner = THIS_MODULE, - .open = keychord_open, - .release = keychord_release, - .read = keychord_read, - .write = keychord_write, - .poll = keychord_poll, -}; - -static struct miscdevice keychord_misc = { - .fops = &keychord_fops, - .name = KEYCHORD_NAME, - .minor = MISC_DYNAMIC_MINOR, -}; - -static int __init keychord_init(void) -{ - return misc_register(&keychord_misc); -} - -static void __exit keychord_exit(void) -{ - misc_deregister(&keychord_misc); -} - -module_init(keychord_init); -module_exit(keychord_exit); diff --git a/include/linux/keychord.h b/include/linux/keychord.h deleted file mode 100644 index 08cf5402102c..000000000000 --- a/include/linux/keychord.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Key chord input driver - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#ifndef __LINUX_KEYCHORD_H_ -#define __LINUX_KEYCHORD_H_ - -#include - -#endif /* __LINUX_KEYCHORD_H_ */ diff --git a/include/uapi/linux/keychord.h b/include/uapi/linux/keychord.h deleted file mode 100644 index ea7cf4d27bbd..000000000000 --- a/include/uapi/linux/keychord.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Key chord input driver - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#ifndef _UAPI_LINUX_KEYCHORD_H_ -#define _UAPI_LINUX_KEYCHORD_H_ - -#include - -#define KEYCHORD_VERSION 1 - -/* - * One or more input_keychord structs are written to /dev/keychord - * at once to specify the list of keychords to monitor. - * Reading /dev/keychord returns the id of a keychord when the - * keychord combination is pressed. A keychord is signalled when - * all of the keys in the keycode list are in the pressed state. - * The order in which the keys are pressed does not matter. - * The keychord will not be signalled if keys not in the keycode - * list are pressed. - * Keychords will not be signalled on key release events. - */ -struct input_keychord { - /* should be KEYCHORD_VERSION */ - __u16 version; - /* - * client specified ID, returned from read() - * when this keychord is pressed. - */ - __u16 id; - - /* number of keycodes in this keychord */ - __u16 count; - - /* variable length array of keycodes */ - __u16 keycodes[]; -}; - -#endif /* _UAPI_LINUX_KEYCHORD_H_ */