Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
filesystems
lustre_2_15
0015-LU-15544-ldiskfs-SUSE-15-SP4-kernel-5.14.2...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 0015-LU-15544-ldiskfs-SUSE-15-SP4-kernel-5.14.21-SUSE.patch of Package lustre_2_15
From 9e837fc5b7a4d6dd26f149de6ee77c625f6740b2 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff <shaun.tancheff@hpe.com> Date: Tue, 15 Nov 2022 06:03:21 -0600 Subject: [PATCH 15/30] LU-15544 ldiskfs: SUSE 15 SP4 kernel 5.14.21 SUSE Updated patch series for SUSE 15 SP4 kernel 5.14.21 based on 5.10 Linux commit v5.14-rc2-19-g188c299e2a26 ext4: Support for checksumming from journal triggers Results in ext4_journal_get_write_access() having 4 arguments. This change provides a compat wrapper for older kernels. Linux commit v5.12-rc4-7-g471fbbea7ff7 ext4: handle casefolding with encryption This change impacts directory entry hash calculation and impacts the EXT4_DIR_REC_LEN and EXT4_DIR_ENTRY_LEN macros, which now require the parent dir's inode. Similarly ext4fs_dirhash() also takes the parent dir's inode. This change provides a compat wrapper for ext4fs_dirhash to support older kernels. Patches dropped due to upstream ext4 landings: linux-5.9/ext4-simple-blockalloc.patch base/ext4-projid-xattrs.patch linux-5.8/ext4-enc-flag.patch Test-Parameters: trivial HPE-bug-id: LUS-10744 Signed-off-by: Shaun Tancheff <shaun.tancheff@hpe.com> Change-Id: Ic50227eaa231e2f1e98f4a7c9e5838e3303cbdf6 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/46504 Tested-by: jenkins <devops@whamcloud.com> Tested-by: Maloo <maloo@whamcloud.com> Reviewed-by: Oleg Drokin <green@whamcloud.com> Reviewed-by: Andreas Dilger <adilger@whamcloud.com> Reviewed-by: Jian Yu <yujian@whamcloud.com> --- config/lustre-build-ldiskfs.m4 | 43 ldiskfs/kernel_patches/patches/linux-5.14/export-ext4fs-dirhash-helper.patch | 47 ldiskfs/kernel_patches/patches/linux-5.14/ext4-corrupted-inode-block-bitmaps-handling-patches.patch | 274 ++ ldiskfs/kernel_patches/patches/linux-5.14/ext4-export-mb-stream-allocator-variables.patch | 106 + ldiskfs/kernel_patches/patches/linux-5.14/ext4-ialloc-uid-gid-and-pass-owner-down.patch | 100 + ldiskfs/kernel_patches/patches/linux-5.14/ext4-kill-dx-root.patch | 260 ++ 
ldiskfs/kernel_patches/patches/linux-5.14/ext4-mballoc-extra-checks.patch | 313 +++ ldiskfs/kernel_patches/patches/linux-5.14/ext4-prealloc.patch | 400 ++++ ldiskfs/kernel_patches/patches/linux-5.14/ext4-print-inum-in-htree-warning.patch | 23 ldiskfs/kernel_patches/patches/linux-5.14/ext4-projid-xattrs.patch | 167 + ldiskfs/kernel_patches/patches/linux-5.14/ext4-xattr-disable-credits-check.patch | 24 ldiskfs/kernel_patches/patches/sles15sp4/ext4-data-in-dirent.patch | 884 +++++++++ ldiskfs/kernel_patches/patches/sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch | 98 + ldiskfs/kernel_patches/patches/sles15sp4/ext4-misc.patch | 226 ++ ldiskfs/kernel_patches/patches/sles15sp4/ext4-pdirop.patch | 931 ++++++++++ ldiskfs/kernel_patches/series/ldiskfs-5.14.21-sles15sp4.series | 30 lustre/osd-ldiskfs/osd_compat.c | 3 lustre/osd-ldiskfs/osd_handler.c | 28 lustre/osd-ldiskfs/osd_iam.c | 27 lustre/osd-ldiskfs/osd_iam_lvar.c | 57 lustre/osd-ldiskfs/osd_internal.h | 11 lustre/osd-ldiskfs/osd_io.c | 4 22 files changed, 4008 insertions(+), 48 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/export-ext4fs-dirhash-helper.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-corrupted-inode-block-bitmaps-handling-patches.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-export-mb-stream-allocator-variables.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-ialloc-uid-gid-and-pass-owner-down.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-kill-dx-root.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-mballoc-extra-checks.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-prealloc.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-print-inum-in-htree-warning.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.14/ext4-projid-xattrs.patch create mode 100644 
ldiskfs/kernel_patches/patches/linux-5.14/ext4-xattr-disable-credits-check.patch create mode 100644 ldiskfs/kernel_patches/patches/sles15sp4/ext4-data-in-dirent.patch create mode 100644 ldiskfs/kernel_patches/patches/sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch create mode 100644 ldiskfs/kernel_patches/patches/sles15sp4/ext4-misc.patch create mode 100644 ldiskfs/kernel_patches/patches/sles15sp4/ext4-pdirop.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.14.21-sles15sp4.series --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -68,6 +68,8 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ ;; # ( 15sp3 ) LDISKFS_SERIES="5.3.18-sles15sp3.series" ;; + 15sp4 ) LDISKFS_SERIES="5.14.21-sles15sp4.series" + ;; esac ]) ], [test x$UBUNTU_KERNEL = xyes], [ @@ -423,7 +425,7 @@ ext4fs_dirhash, [ int f = ext4fs_dirhash(NULL, NULL, 0, NULL); (void)f; ],[ - AC_DEFINE(HAVE_LDISKFSFS_GETHASH_INODE_ARG, 1, + AC_DEFINE(HAVE_LDISKFSFS_DIRHASH_WITH_DIR, 1, [ldiskfsfs_dirhash takes an inode argument]) ]) EXTRA_KCFLAGS="$tmp_flags" @@ -491,6 +493,44 @@ EXTRA_KCFLAGS="$tmp_flags" ]) # LB_JBD2_JOURNAL_GET_MAX_TXN_BUFS # +# LB_EXT4_JOURNAL_GET_WRITE_ACCESS_4A +# +# Linux v5.14-rc2-19-g188c299e2a26 +# ext4: Support for checksumming from journal triggers +# +AC_DEFUN([LB_EXT4_JOURNAL_GET_WRITE_ACCESS_4A], [ +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-Werror" +LB_CHECK_COMPILE([if ext4_journal_get_write_access takes 4 arguments], +ext4_journal_get_write_access, [ + #include <linux/fs.h> + #include "$EXT4_SRC_DIR/ext4.h" + #include "$EXT4_SRC_DIR/ext4_jbd2.h" + + int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, + struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) + { + return 0; + } +],[ + handle_t *handle = NULL; + struct super_block *sb = NULL; + struct buffer_head *bh = NULL; + enum ext4_journal_trigger_type trigger_type = EXT4_JTR_NONE; + int err = 
ext4_journal_get_write_access(handle, sb, bh, trigger_type); + + (void)err; +],[ + AC_DEFINE(HAVE_EXT4_JOURNAL_GET_WRITE_ACCESS_4ARGS, 1, + [ext4_journal_get_write_access() has 4 arguments]) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) # LB_EXT4_JOURNAL_GET_WRITE_ACCESS_4A + +# # LB_CONFIG_LDISKFS # AC_DEFUN([LB_CONFIG_LDISKFS], [ @@ -545,6 +585,7 @@ AS_IF([test x$enable_ldiskfs != xno],[ LB_JBD2_H_TOTAL_CREDITS LB_EXT4_INC_DEC_COUNT_2ARGS LB_JBD2_JOURNAL_GET_MAX_TXN_BUFS + LB_EXT4_JOURNAL_GET_WRITE_ACCESS_4A AC_DEFINE(CONFIG_LDISKFS_FS_POSIX_ACL, 1, [posix acls for ldiskfs]) AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [fs security for ldiskfs]) AC_DEFINE(CONFIG_LDISKFS_FS_XATTR, 1, [extened attributes for ldiskfs]) --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/export-ext4fs-dirhash-helper.patch @@ -0,0 +1,47 @@ +Subject: [PATCH] linux-5.14/export-ext4fs-dirhash-helper + +--- + fs/ext4/ext4.h | 3 +++ + fs/ext4/hash.c | 5 +++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 083c109..09d8720 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2996,6 +2996,9 @@ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + ++extern int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, ++ struct dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern int ext4_mark_inode_used(struct super_block *sb, int ino); + extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, +diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c +index f34f417..a41bd48 100644 +--- a/fs/ext4/hash.c ++++ b/fs/ext4/hash.c +@@ -197,8 +197,8 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
+ */ +-static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, +- struct dx_hash_info *hinfo) ++int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, ++ struct dx_hash_info *hinfo) + { + __u32 hash; + __u32 minor_hash = 0; +@@ -286,6 +286,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, + hinfo->minor_hash = minor_hash; + return 0; + } ++EXPORT_SYMBOL(__ext4fs_dirhash); + + int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo) +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,274 @@ +Subject: [PATCH] ext4-corrupted-inode-block-bitmaps-handling-patches + +Since we could skip corrupt block groups, this patch +uses ext4_warning() instead of ext4_error() so that the FS is not +remounted RO by default + +--- + fs/ext4/balloc.c | 10 +++---- + fs/ext4/ialloc.c | 6 ++--- + fs/ext4/mballoc.c | 68 +++++++++++++++++++---------------------------- + 3 files changed, 35 insertions(+), 49 deletions(-) + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 9dc6e74..ca05c72 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -387,7 +387,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ++ ext4_warning(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -395,8 +395,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: block %llu: invalid block bitmap", +- block_group, blk); ++ ext4_warning(sb, "bg %u: 
block %llu: invalid block bitmap", ++ block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; +@@ -479,8 +479,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + if (err) { +- ext4_error(sb, "Failed to init block bitmap for group " +- "%u: %d", block_group, err); ++ ext4_warning(sb, "Failed to init block bitmap for group " ++ "%u: %d", block_group, err); + goto out; + } + goto verify; +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 4840190..f73d3f8 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -102,8 +102,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " +- "inode_bitmap = %llu", block_group, blk); ++ ext4_warning(sb, "Corrupt inode bitmap - block_group = %u, " ++ "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -353,7 +353,7 @@ out: + if (!fatal) + fatal = err; + } else { +- ext4_error(sb, "bit already cleared for inode %lu", ino); ++ ext4_warning(sb, "bit already cleared for inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index de49616..90dfe9c 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1134,10 +1134,14 @@ int ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, 0, 0, +- "block bitmap and bg descriptor " +- "inconsistent: %u vs %u free clusters", +- free, grp->bb_free); ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc(sb, group, NULL); 
++ ext4_warning(sb, "group %lu: block bitmap and bg descriptor " ++ "inconsistent: %u vs %u free clusters " ++ "%u in gd, %lu pa's", ++ (long unsigned int)group, free, grp->bb_free, ++ ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value +@@ -1481,7 +1485,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + int block; + int pnum; + int poff; +- struct page *page; ++ struct page *page = NULL; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -1507,7 +1511,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) +- return ret; ++ goto err; + } + + /* +@@ -1607,6 +1611,7 @@ err: + put_page(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; ++ ext4_warning(sb, "Error loading buddy information for %u", group); + return ret; + } + +@@ -4526,9 +4531,11 @@ int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, + } + + if (free != free_in_gdp) { +- ext4_error(sb, "on-disk bitmap for group %d" ++ ext4_warning(sb, "on-disk bitmap for group %d" + "corrupted: %u blocks free in bitmap, %u - in gd\n", + group, free, free_in_gdp); ++ ext4_mark_group_bitmap_corrupted(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EIO; + } + return 0; +@@ -4894,16 +4901,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + /* "free < pa->pa_free" means we maybe double alloc the same blocks, + * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ + if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { +- ext4_error(sb, "pa free mismatch: [pa %p] " +- "[phy %lu] [logic %lu] [len %u] [free %u] " +- "[error %u] [inode %d] [freed %u]", pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- pa->pa_len, (unsigned)pa->pa_free, +- 
(unsigned)pa->pa_error, pa->pa_inode->i_ino, +- free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", +- free, pa->pa_free); ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. +@@ -4963,16 +4962,11 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, +- "Error %d reading block bitmap for %u", +- err, group); + goto out_dbg; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- err, group); + put_bh(bitmap_bh); + goto out_dbg; + } +@@ -5129,17 +5123,12 @@ repeat: + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + return; +- } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", +- err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } +@@ -5434,11 +5423,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + continue; +- } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_get_group_info(sb, group)->bb_prealloc_nr--; +@@ -5773,7 +5759,7 @@ errout: + * been updated or not when fail case. 
So can + * not revert pa_free back, just mark pa_error*/ + pa->pa_error++; +- ext4_error(sb, ++ ext4_warning(sb, + "Updating bitmap error: [err %d] " + "[pa %p] [phy %lu] [logic %lu] " + "[len %u] [free %u] [error %u] " +@@ -5784,6 +5770,7 @@ errout: + (unsigned)pa->pa_free, + (unsigned)pa->pa_error, + pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ ext4_mark_group_bitmap_corrupted(sb, 0, 0); + } + } + ext4_mb_release_context(ac); +@@ -6182,7 +6169,7 @@ do_more: + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * We need to make sure we don't reuse the freed block until after the +@@ -6273,8 +6260,9 @@ do_more: + goto do_more; + } + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return; + } + +@@ -6375,7 +6363,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * need to update group_info->bb_free and bitmap +@@ -6414,8 +6402,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + err = ret; + + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return err; + } + +@@ -6538,11 +6527,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); +- if (ret) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- ret, group); +- return ret; +- } ++ if (ret) ++ return ret; + ext4_lock_group(sb, group); + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { +-- +2.34.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-export-mb-stream-allocator-variables.patch @@ -0,0 +1,106 @@ +--- + fs/ext4/ext4.h | 2 ++ + fs/ext4/mballoc.c | 58 
+++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/sysfs.c | 4 ++++ + 3 files changed, 64 insertions(+) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 1e0cb82..60a780b 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3053,6 +3053,8 @@ int __init ext4_fc_init_dentry_cache(void); + + /* mballoc.c */ + extern const struct proc_ops ext4_seq_prealloc_table_fops; ++extern const struct proc_ops ext4_seq_mb_last_group_fops; ++extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v); + extern const struct seq_operations ext4_mb_seq_groups_ops; + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; + extern long ext4_mb_stats; +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 3a18aff..269cf5a 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3169,6 +3169,64 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + return cachep; + } + ++#define EXT4_MB_MAX_INPUT_STRING_SIZE 32 ++ ++static ssize_t ext4_mb_last_group_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ char dummy[EXT4_MB_MAX_INPUT_STRING_SIZE + 1]; ++ struct super_block *sb = PDE_DATA(file_inode(file)); ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ unsigned long val; ++ char *end; ++ ++ if (cnt > EXT4_MB_MAX_INPUT_STRING_SIZE) ++ return -EINVAL; ++ if (copy_from_user(dummy, buf, cnt)) ++ return -EFAULT; ++ dummy[cnt] = '\0'; ++ val = simple_strtoul(dummy, &end, 0); ++ if (dummy == end) ++ return -EINVAL; ++ if (val >= ext4_get_groups_count(sb)) ++ return -ERANGE; ++ spin_lock(&sbi->s_md_lock); ++ sbi->s_mb_last_group = val; ++ sbi->s_mb_last_start = 0; ++ spin_unlock(&sbi->s_md_lock); ++ return cnt; ++} ++ ++static int ext4_mb_seq_last_group_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ ++ seq_printf(m , "%ld\n", sbi->s_mb_last_group); ++ return 0; ++} ++ ++static int ext4_mb_seq_last_group_open(struct inode *inode, struct file *file) ++{ ++ 
return single_open(file, ext4_mb_seq_last_group_seq_show, PDE_DATA(inode)); ++} ++ ++const struct proc_ops ext4_seq_mb_last_group_fops = { ++ .proc_open = ext4_mb_seq_last_group_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = seq_release, ++ .proc_write = ext4_mb_last_group_write, ++}; ++ ++int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ ++ seq_printf(m , "%ld\n", sbi->s_mb_last_start); ++ return 0; ++} ++ + /* + * Allocate the top-level s_group_info array for the specified number + * of groups +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index ed7a7e5..c1ef6e3 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -555,6 +555,10 @@ int ext4_register_sysfs(struct super_block *sb) + &ext4_mb_seq_groups_ops, sb); + proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, + &ext4_seq_prealloc_table_fops, sb); ++ proc_create_data("mb_last_group", S_IRUGO, sbi->s_proc, ++ &ext4_seq_mb_last_group_fops, sb); ++ proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc, ++ ext4_mb_seq_last_start_seq_show, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-ialloc-uid-gid-and-pass-owner-down.patch @@ -0,0 +1,100 @@ +Subject: [PATCH] pass inode timestamps at initial creation + +pass inode timestamps at initial creation +--- + fs/ext4/ext4.h | 8 ++++---- + fs/ext4/ialloc.c | 11 ++++++++++- + fs/ext4/namei.c | 13 +++++++++++-- + 3 files changed, 25 insertions(+), 7 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index ad9d996..8f87cea 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3010,15 +3010,15 @@ extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int 
handle_type, unsigned int line_no, +- int nblocks); ++ int nblocks, struct iattr *iattr); + + #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ +- (goal), (owner), i_flags, 0, 0, 0) ++ (goal), (owner), i_flags, 0, 0, 0, NULL) + #define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ +- 0, (type), __LINE__, (nblocks)) ++ 0, (type), __LINE__, (nblocks), NULL) + + + extern void ext4_free_inode(handle_t *, struct inode *); +@@ -3202,7 +3202,7 @@ extern int ext4_orphan_add(handle_t *, struct inode *); + extern int ext4_orphan_del(handle_t *, struct inode *); + extern struct inode *ext4_create_inode(handle_t *handle, + struct inode *dir, int mode, +- uid_t *owner); ++ struct iattr *iattr); + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index f73d3f8..18375ec 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -928,7 +928,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, +- int nblocks) ++ int nblocks, struct iattr *iattr) + { + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; +@@ -1308,6 +1308,15 @@ got: + if (err) + goto fail_drop; + ++ if (iattr) { ++ if (iattr->ia_valid & ATTR_CTIME) ++ inode->i_ctime = iattr->ia_ctime; ++ if (iattr->ia_valid & ATTR_MTIME) ++ inode->i_mtime = iattr->ia_mtime; ++ if (iattr->ia_valid & ATTR_ATIME) ++ inode->i_atime = iattr->ia_atime; ++ } ++ + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and +diff --git a/fs/ext4/namei.c 
b/fs/ext4/namei.c +index 6798499..38da4fe 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -3336,11 +3336,20 @@ static int ext4_add_nondir(handle_t *handle, + /* Return locked inode, then the caller can modify the inode's states/flags + * before others finding it. The caller should unlock the inode by itself. */ + struct inode *ext4_create_inode(handle_t *handle, struct inode *dir, int mode, +- uid_t *owner) ++ struct iattr *iattr) + { + struct inode *inode; ++ uid_t owner[2] = {0, 0}; + +- inode = ext4_new_inode(handle, dir, mode, NULL, 0, owner, 0); ++ if (iattr) { ++ if (iattr->ia_valid & ATTR_UID) ++ owner[0] = from_kuid(&init_user_ns, iattr->ia_uid); ++ if (iattr->ia_valid & ATTR_GID) ++ owner[1] = from_kgid(&init_user_ns, iattr->ia_gid); ++ } ++ ++ inode = __ext4_new_inode(&init_user_ns, handle, dir, mode, NULL, 0, owner, 0, ++ 0, 0, 0, iattr); + if (!IS_ERR(inode)) { + if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { + #ifdef CONFIG_LDISKFS_FS_XATTR +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-kill-dx-root.patch @@ -0,0 +1,260 @@ +Subject: [PATCH] linux-5.14/ext4-kill-dx-root.patch + +--- + fs/ext4/namei.c | 119 +++++++++++++++++++++++++----------------------- + 1 file changed, 61 insertions(+), 58 deletions(-) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index e4514c9..92d29fe 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -216,22 +216,13 @@ struct dx_entry + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +-struct dx_root ++struct dx_root_info + { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct dx_entry entries[]; ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; + }; + + struct dx_node +@@ -535,6 +526,16 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ ++struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++{ ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ ++ return (struct dx_root_info *)de; ++} + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +@@ -780,7 +781,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; ++ struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); + u32 hash; +@@ -790,24 +791,24 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + +- root = (struct dx_root *) frame->bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_LEGACY && +- root->info.hash_version != DX_HASH_SIPHASH) { +- ext4_warning_inode(dir, +- "Unrecognised inode hash code %u for directory %lu", +- root->info.hash_version, dir->i_ino); ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data); ++ if (info->hash_version != 
DX_HASH_TEA && ++ info->hash_version != DX_HASH_HALF_MD4 && ++ info->hash_version != DX_HASH_LEGACY && ++ info->hash_version != DX_HASH_SIPHASH) { ++ ext4_warning(dir->i_sb, ++ "Unrecognised inode hash code %d for directory #%lu", ++ info->hash_version, dir->i_ino); + goto fail; + } + if (ext4_hash_in_dirent(dir)) { +- if (root->info.hash_version != DX_HASH_SIPHASH) { ++ if (info->hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash in dirent, but hash is not SIPHASH"); + goto fail; + } + } else { +- if (root->info.hash_version == DX_HASH_SIPHASH) { ++ if (info->hash_version == DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash code is SIPHASH, but hash not in dirent"); + goto fail; +@@ -815,7 +816,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + } + if (fname) + hinfo = &fname->hinfo; +- hinfo->hash_version = root->info.hash_version; ++ hinfo->hash_version = info->hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -825,13 +826,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); + hash = hinfo->hash; + +- if (root->info.unused_flags & 1) { ++ if (info->unused_flags & 1) { + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", +- root->info.unused_flags); ++ info->unused_flags); + goto fail; + } + +- indirect = root->info.indirect_levels; ++ indirect = info->indirect_levels; + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" +@@ -844,14 +845,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + goto fail; + } + +- entries = (struct dx_entry *)(((char *)&root->info) + +- root->info.info_length); ++ entries = (struct dx_entry *)(((char *)info) + info->info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, +- root->info.info_length)) { ++ 
info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, root->info.info_length)); ++ dx_root_limit(dir, info->info_length)); + goto fail; + } + +@@ -923,7 +923,7 @@ static void dx_release(struct dx_frame *frames) + if (frames[0].bh == NULL) + return; + +- info = &((struct dx_root *)frames[0].bh->b_data)->info; ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); + /* save local copy, "info" may be freed after brelse() */ + indirect_levels = info->indirect_levels; + for (i = 0; i <= indirect_levels; i++) { +@@ -2181,16 +2181,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct inode *inode, struct buffer_head *bh) + { + struct buffer_head *bh2; +- struct dx_root *root; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; +- struct ext4_dir_entry_2 *de, *de2; ++ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data2, *top; + unsigned len; + int retval; + unsigned blocksize; + ext4_lblk_t block; +- struct fake_dirent *fde; ++ struct dx_root_info *dx_info; + int csum_size = 0; + + if (ext4_has_metadata_csum(inode->i_sb)) +@@ -2206,18 +2205,19 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + brelse(bh); + return retval; + } +- root = (struct dx_root *) bh->b_data; ++ ++ dot_de = (struct ext4_dir_entry_2 *)bh->b_data; ++ dotdot_de = ext4_next_entry(dot_de, blocksize); + + /* The 0th block becomes the root, move the dirents out */ +- fde = &root->dotdot; +- de = (struct ext4_dir_entry_2 *)((char *)fde + +- ext4_rec_len_from_disk(fde->rec_len, blocksize)); +- if ((char *) de >= (((char *) root) + blocksize)) { ++ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de + ++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize)); ++ if ((char *)de >= (((char *)dot_de) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EFSCORRUPTED; 
+ } +- len = ((char *) root) + (blocksize - csum_size) - (char *) de; ++ len = ((char *)dot_de) + (blocksize - csum_size) - (char *)de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block); +@@ -2241,24 +2241,26 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + ext4_initialize_dirent_tail(bh2, blocksize); + + /* Initialize the root; the dot dirents already exist */ +- de = (struct ext4_dir_entry_2 *) (&root->dotdot); +- de->rec_len = ext4_rec_len_to_disk( +- blocksize - ext4_dir_rec_len(2, NULL), blocksize); +- memset (&root->info, 0, sizeof(root->info)); +- root->info.info_length = sizeof(root->info); ++ dotdot_de->rec_len = ++ ext4_rec_len_to_disk(blocksize - le16_to_cpu(dot_de->rec_len), ++ blocksize); ++ ++ /* initialize hashing info */ ++ dx_info = dx_get_dx_info(dot_de); ++ memset(dx_info, 0, sizeof(*dx_info)); ++ dx_info->info_length = sizeof(*dx_info); + if (ext4_hash_in_dirent(dir)) +- root->info.hash_version = DX_HASH_SIPHASH; ++ dx_info->hash_version = DX_HASH_SIPHASH; + else +- root->info.hash_version = ++ dx_info->hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; +- +- entries = root->entries; ++ entries = (void *)dx_info + sizeof(*dx_info); + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ +- fname->hinfo.hash_version = root->info.hash_version; ++ fname->hinfo.hash_version = dx_info->hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -2626,7 +2628,7 @@ again: + if (restart || err) + goto journal_error; + } else { +- struct dx_root *dxroot; ++ struct dx_root_info *info; + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + 
dx_set_limit(entries2, dx_node_limit(dir)); +@@ -2634,8 +2636,9 @@ again: + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); +- dxroot = (struct dx_root *)frames[0].bh->b_data; +- dxroot->info.indirect_levels += 1; ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *) ++ frames[0].bh->b_data); ++ info->indirect_levels = 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + dxroot->info.indirect_levels)); +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-mballoc-extra-checks.patch @@ -0,0 +1,313 @@ +Subject: [PATCH] linux-5.14/ext4-mballoc-extra-checks.patch + +--- + fs/ext4/ext4.h | 1 + + fs/ext4/mballoc.c | 103 ++++++++++++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.h | 2 +- + 3 files changed, 93 insertions(+), 13 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 89e289c..5707cba 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3424,6 +3424,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 06c583d..a2b5d56 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -399,7 +399,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -1101,7 +1101,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int 
ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -1145,6 +1145,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + +@@ -1154,6 +1155,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -1256,7 +1259,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + } + + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; +@@ -1300,7 +1303,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1315,7 +1318,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -1325,7 +1328,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -2840,9 +2844,11 @@ static void 
*ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err, buddy_loaded = 0; ++ int free = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, +@@ -2855,7 +2861,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + + group--; + if (group == 0) +- seq_puts(seq, "#group: free frags first [" ++ seq_puts(seq, "#group: bfree gfree frags first pa [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + +@@ -2873,13 +2879,19 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + buddy_loaded = 1; + } + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_group_clusters(sb, gdp); ++ + memcpy(&sg, ext4_get_group_info(sb, group), i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -4476,23 +4488,72 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + return; + } + ++/* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. 
++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned int max = EXT4_CLUSTERS_PER_GROUP(sb); ++ unsigned int i, first, free = 0; ++ unsigned int free_in_gdp = ext4_free_group_clusters(sb, gdp); ++ ++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) ++ return 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != free_in_gdp) { ++ ext4_error(sb, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, free_in_gdp); ++ return -EIO; ++ } ++ return 0; ++} ++ + /* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; ++ int skip = 0, count = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. 
+ * we don't need any locking here +@@ -4508,13 +4569,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + ext4_set_bits(bitmap, start, len); + preallocated += len; ++ count++; ++ } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; + } + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_mark_pa_deleted(struct super_block *sb, +@@ -4598,6 +4669,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -4689,6 +4761,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) + pa->pa_inode = ac->ac_inode; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); +@@ -4744,6 +4817,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) + pa->pa_inode = NULL; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + /* + * We will later add the new pa to the right bucket +@@ -4914,6 +4988,8 @@ repeat: + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -5050,7 +5126,7 @@ repeat: + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -5063,6 +5139,8 @@ repeat: + } + + ext4_lock_group(sb, 
group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +@@ -5357,6 +5435,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index 39da92c..b6c4a30 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -66,7 +66,7 @@ + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-prealloc.patch @@ -0,0 +1,400 @@ +Subject: [PATCH] linux-5.14/ext4-prealloc.patch + +--- + fs/ext4/ext4.h | 7 +- + fs/ext4/inode.c | 3 + + fs/ext4/mballoc.c | 219 +++++++++++++++++++++++++++++++++++----------- + fs/ext4/sysfs.c | 8 +- + 4 files changed, 182 insertions(+), 55 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 2f1249b..4aa8ae4 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1294,6 +1294,8 @@ extern void ext4_set_bits(void *bm, int cur, int len); + /* Metadata checksum algorithm codes */ + #define EXT4_CRC32C_CHKSUM 1 + ++#define EXT4_MAX_PREALLOC_TABLE 64 ++ + /* + * Structure of the super block + */ +@@ -1557,11 +1559,13 @@ struct ext4_sb_info { + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int 
s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; +@@ -2888,6 +2892,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid); + int __init ext4_fc_init_dentry_cache(void); + + /* mballoc.c */ ++extern const struct proc_ops ext4_seq_prealloc_table_fops; + extern const struct seq_operations ext4_mb_seq_groups_ops; + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; + extern long ext4_mb_stats; +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index bf6a22c..bffdf63 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2730,6 +2730,9 @@ static int ext4_writepages(struct address_space *mapping, + PAGE_SIZE >> inode->i_blkbits); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 349fabe..06c583d 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3050,6 +3050,99 @@ const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .show = ext4_mb_seq_structs_summary_show, + }; + ++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, ++ char *str, size_t cnt, ++ int update) ++{ ++ unsigned long value; ++ unsigned long prev = 0; ++ char *cur; ++ char *next; ++ char *end; ++ int num = 0; ++ ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &next, 0); ++ if (value == 0) ++ break; ++ if (cur == next) ++ return -EINVAL; ++ ++ cur = next; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -EINVAL; ++ ++ /* they should add values in order */ ++ if (value <= prev) ++ return -EINVAL; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = value; ++ ++ prev = value; ++ num++; ++ } ++ ++ if (num > EXT4_MAX_PREALLOC_TABLE - 1) ++ return -EOVERFLOW; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = 0; ++ ++ return 0; ++} 
++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); ++ char str[128] = { 0 }; ++ int rc; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0); ++ if (rc) ++ return rc; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1); ++ return rc ? rc : cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE && ++ sbi->s_mb_prealloc_table[i] != 0; i++) ++ seq_printf(m, "%lu ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode)); ++} ++ ++const struct proc_ops ext4_seq_prealloc_table_fops = { ++ .proc_open = mb_prealloc_table_seq_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++ .proc_write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -3311,7 +3404,7 @@ static int ext4_groupinfo_create_slab(size_t size) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -3380,7 +3473,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + 
sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; + /* +@@ -3405,9 +3497,29 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, sbi->s_stripe); ++ ++ /* Allocate table once */ ++ sbi->s_mb_prealloc_table = kzalloc( ++ EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (sbi->s_stripe == 0) { ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -3441,6 +3553,7 @@ out_free_locality_groups: + out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -3713,7 +3826,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); +- BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -3922,13 +4034,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, start_off; ++ loff_t size; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; ++ unsigned long value, 
last_non_zero; + + /* do normalize only data requests, metadata requests + do not need preallocation */ +@@ -3957,51 +4070,46 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ start = wind = 0; ++ value = last_non_zero = 0; ++ ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) { ++ value = sbi->s_mb_prealloc_table[i]; ++ if (value == 0) ++ break; ++ else ++ last_non_zero = value; + +- /* max size of free chunks */ +- max = 2 << bsbits; +- +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) +- +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; ++ if (size <= value) { ++ wind = value; ++ break; ++ } ++ } ++ ++ if (wind == 0) { ++ if (last_non_zero != 0) { ++ __u64 tstart, tend; ++ /* file is quite large, 
we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = last_non_zero; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } + } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), +- ac->ac_o_ex.fe_len) << bsbits; ++ size = wind; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ ++ ++ orig_size = size; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -4083,7 +4191,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -5101,11 +5208,19 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index 2314f74..5fc52fc 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -212,7 +212,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UL(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UL(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); +@@ -261,7 +262,8 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_inode_prealloc), + ATTR_LIST(mb_max_linear_groups), +@@ -541,6 +543,8 @@ int ext4_register_sysfs(struct super_block *sb) + ext4_fc_info_show, sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, ++ &ext4_seq_prealloc_table_fops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-print-inum-in-htree-warning.patch @@ -0,0 +1,23 @@ +--- + fs/ext4/namei.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 87cefa6..caf84b5 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -794,8 +794,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + root->info.hash_version != DX_HASH_HALF_MD4 && + 
root->info.hash_version != DX_HASH_LEGACY && + root->info.hash_version != DX_HASH_SIPHASH) { +- ext4_warning_inode(dir, "Unrecognised inode hash code %u", +- root->info.hash_version); ++ ext4_warning_inode(dir, ++ "Unrecognised inode hash code %u for directory %lu", ++ root->info.hash_version, dir->i_ino); + goto fail; + } + if (ext4_hash_in_dirent(dir)) { +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-projid-xattrs.patch @@ -0,0 +1,167 @@ +Subject: [PATCH] P: linux-5.14/ext4-projid-xattrs.patch + +--- + fs/ext4/ext4.h | 1 + + fs/ext4/ioctl.c | 4 +-- + fs/ext4/xattr.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 87 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index afca42d..9ba2e8b 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3164,6 +3164,7 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + + /* ioctl.c */ ++extern int ext4_ioctl_setproject(struct inode *, __u32); + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + int ext4_fileattr_set(struct user_namespace *mnt_userns, +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 6eed617..88cb1fb 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -461,7 +461,7 @@ flags_out: + } + + #ifdef CONFIG_QUOTA +-static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) ++int ext4_ioctl_setproject(struct inode *inode, __u32 projid) + { + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); +@@ -546,7 +546,7 @@ out_stop: + return err; + } + #else +-static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) ++int ext4_ioctl_setproject(struct inode *inode, __u32 projid) + { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index aabc92e..442bfbd 100644 +--- 
a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -62,6 +62,8 @@ + #include "xattr.h" + #include "acl.h" + ++#define EXT4_XATTR_PROJID "projid" ++ + #ifdef EXT4_XATTR_DEBUG + # define ea_idebug(inode, fmt, ...) \ + printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ +@@ -648,11 +650,30 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, + return -ERANGE; + + down_read(&EXT4_I(inode)->xattr_sem); ++ if (name_index == EXT4_XATTR_INDEX_TRUSTED && ++ strcmp(name, EXT4_XATTR_PROJID) == 0 && ++ ext4_has_feature_project(inode->i_sb)) { ++ /* 10 chars to hold u32 in decimal, plus ending \0 */ ++ char value[11]; ++ __u32 projid = (__u32)from_kprojid(&init_user_ns, ++ EXT4_I(inode)->i_projid); ++ error = snprintf(value, sizeof(value), "%u", projid); ++ if (buffer) { ++ if (error > buffer_size) { ++ error = -ERANGE; ++ goto out; ++ } ++ memcpy(buffer, value, error); ++ } ++ goto out; ++ } ++ + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); ++out: + up_read(&EXT4_I(inode)->xattr_sem); + return error; + } +@@ -775,7 +796,33 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; ++ if (buffer) { ++ buffer += ret; ++ buffer_size -= ret; ++ } + ret += ret2; ++ if (ext4_has_feature_project(dentry->d_sb)) { ++ size_t prefix_len = strlen(XATTR_TRUSTED_PREFIX); ++ size_t name_len = strlen(EXT4_XATTR_PROJID); ++ size_t size = prefix_len + name_len + 1; ++ ++ if (__kprojid_val(EXT4_I(dentry->d_inode)->i_projid) == ++ EXT4_DEF_PROJID) ++ goto errout; ++ if (buffer) { ++ if (size > buffer_size) { ++ ret = -ERANGE; ++ goto errout; ++ } ++ strncpy(buffer, XATTR_TRUSTED_PREFIX, prefix_len); ++ buffer += prefix_len; ++ strncpy(buffer, EXT4_XATTR_PROJID, name_len); ++ buffer += name_len; ++ *buffer++ = 0; ++ buffer_size -= 
size; ++ } ++ ret += size; ++ } + errout: + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); + return ret; +@@ -2456,6 +2503,43 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, + int error, retries = 0; + int credits; + ++ if (name_index == EXT4_XATTR_INDEX_TRUSTED && ++ strcmp(name, EXT4_XATTR_PROJID) == 0 && ++ ext4_has_feature_project(inode->i_sb)) { ++ /* 10 chars to hold u32 in decimal, plus ending \0 */ ++ char buffer[11]; ++ __u32 projid; ++ ++ /* ++ * Project Quota ID state is only allowed to change from within ++ * the init namespace. ++ */ ++ if (current_user_ns() != &init_user_ns) ++ return -EINVAL; ++ ++ if (value && value_len) { ++ if (value_len >= sizeof(buffer)) ++ return -EINVAL; ++ memcpy(buffer, value, value_len); ++ buffer[value_len] = '\0'; ++ error = kstrtouint(buffer, 0, &projid); ++ if (error) ++ return error; ++ } else { ++ projid = EXT4_DEF_PROJID; ++ } ++ ++ /* ++ * Caller is allowed to change the project ID. If it is being ++ * changed, make sure that the new value is valid. 
++ */ ++ if (!projid_valid(make_kprojid(&init_user_ns, projid))) ++ return -EINVAL; ++ ++ error = ext4_ioctl_setproject(inode, projid); ++ return error; ++ } ++ + error = dquot_initialize(inode); + if (error) + return error; +-- +2.27.0 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.14/ext4-xattr-disable-credits-check.patch @@ -0,0 +1,24 @@ +Subject: [PATCH] ext4-xattr-disable-credits-check + +--- + fs/ext4/xattr.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 850b9cf..f29de20 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -2387,10 +2387,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + flags & XATTR_CREATE); + brelse(bh); + +- if (jbd2_handle_buffer_credits(handle) < credits) { +- error = -ENOSPC; +- goto cleanup; +- } + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); + } + +-- +2.27.0 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles15sp4/ext4-data-in-dirent.patch @@ -0,0 +1,884 @@ +Subject: [PATCH] This patch implements feature which allows ext4 fs users + (e.g. Lustre) to store data in ext4 dirent. + +Data is stored in ext4 dirent after file-name, this space is accounted +in de->rec_len. Flag EXT4_DIRENT_LUFID is added to d_type when extra +data is present. + +This uses dentry->d_fsdata to pass fid to ext4. so no +changes in ext4_add_entry() interface required. 
+--- + fs/ext4/dir.c | 9 +- + fs/ext4/ext4.h | 106 ++++++++++++++++-- + fs/ext4/fast_commit.c | 2 +- + fs/ext4/inline.c | 8 +- + fs/ext4/namei.c | 249 ++++++++++++++++++++++++++++++++---------- + fs/ext4/super.c | 4 +- + 6 files changed, 303 insertions(+), 75 deletions(-) + +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c +index 74b172a..c6afabc 100644 +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -466,12 +466,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 0; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + ent_name->len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -480,7 +485,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, ent_name->name, ent_name->len); ++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); + + while (*p) { + parent = *p; +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 0791a8b..f1bc21d 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1165,6 +1165,7 @@ struct ext4_inode_info { + __u32 i_csum_seed; + + kprojid_t i_projid; ++ void *i_dirdata; + }; + + /* +@@ -1186,6 +1187,7 @@ struct ext4_inode_info { + * Mount flags set via mount options or defaults + */ + #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ ++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 
0x00010 /* Continue on errors */ +@@ -2117,6 +2119,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ +@@ -2326,6 +2329,42 @@ struct ext4_dir_entry_tail { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. 
++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __packed; ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param *p) ++{ ++ if (!ext4_has_feature_dirdata(sb)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + #define EXT4_FT_DIR_CSUM 0xDE + +@@ -2337,6 +2376,17 @@ struct ext4_dir_entry_tail { + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) + #define EXT4_MAX_REC_LEN ((1<<16)-1) ++#define EXT4_DIR_REC_LEN_(name_len, i_dir) \ ++ ext4_dir_rec_len((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN_(de, i_dir) \ ++ (EXT4_DIR_REC_LEN_((de)->name_len + ext4_get_dirent_data_len(de), \ ++ (i_dir))) ++/* ldiskfs */ ++#define EXT4_DIR_REC_LEN(name_len, i_dir) EXT4_DIR_REC_LEN_((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN(de, i_dir) EXT4_DIR_ENTRY_LEN_((de), (i_dir)) ++/* lustre osd_handler compat -- ifdef LDISKFS_DIR_REC_LEN_WITH_DIR */ ++#define EXT4_DIR_REC_LEN_WITH_DIR 1 ++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len), NULL) + + /* + * The rec_len is dependent on the type of directory. Directories that are +@@ -2344,10 +2394,10 @@ struct ext4_dir_entry_tail { + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. 
+ */ +-static inline unsigned int ext4_dir_rec_len(__u8 name_len, ++static inline unsigned int ext4_dir_rec_len(__u32 name_len, + const struct inode *dir) + { +- int rec_len = (name_len + 8 + EXT4_DIR_ROUND); ++ __u32 rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); +@@ -2821,11 +2871,13 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de); ++ struct ext4_dir_entry_2 **dest_de, ++ int *dlen); + void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname); ++ struct ext4_filename *fname, ++ void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb) && +@@ -2841,10 +2893,17 @@ static const unsigned char ext4_filetype_table[] = { + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) ++ int fl_index = filetype & EXT4_FT_MASK; ++ ++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) + return DT_UNKNOWN; + +- return ext4_filetype_table[filetype]; ++ if (!test_opt(sb, DIRDATA)) ++ return ext4_filetype_table[fl_index]; ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +@@ -3048,7 +3107,8 @@ extern int ext4_ind_migrate(struct inode *inode); + + /* namei.c */ + extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); + extern int ext4_orphan_add(handle_t *, struct inode *); +@@ 
-3059,6 +3119,8 @@ extern struct inode *ext4_create_inode(handle_t *handle, + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh); ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, const void *, const void *); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern int ext4_search_dir(struct buffer_head *bh, +@@ -3862,6 +3924,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) + return buffer_uptodate(bh); + } + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. ++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; ++ ++ if (!t->det_reserved_zero1 && ++ le16_to_cpu(t->det_rec_len) == ++ sizeof(struct ext4_dir_entry_tail) && ++ !t->det_reserved_zero2 && ++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) ++ return 0; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c +index 276d9e6..3be0f08 100644 +--- a/fs/ext4/fast_commit.c ++++ b/fs/ext4/fast_commit.c +@@ -1596,7 +1596,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, + jbd_debug(1, "Dir %d not found.", darg.ino); + goto out; + } +- ret = 
ext4_init_new_dir(NULL, dir, inode); ++ ret = ext4_init_new_dir(NULL, dir, inode, NULL, NULL); + iput(dir); + if (ret) { + ret = 0; +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index 9626c31..ed31b5c 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -1029,7 +1029,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +- inline_size, fname, &de); ++ inline_size, fname, &de, NULL); + if (err) + return err; + +@@ -1038,7 +1038,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + EXT4_JTR_NONE); + if (err) + return err; +- ext4_insert_dentry(dir, inode, de, inline_size, fname); ++ ext4_insert_dentry(dir, inode, de, inline_size, fname, NULL); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +@@ -1396,7 +1396,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +@@ -1406,7 +1406,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 7f00dc3..51c950b 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -285,13 +285,14 @@ static unsigned dx_get_count(struct dx_entry *entries); + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(struct inode 
*dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame); +-static void dx_release(struct dx_frame *frames); ++static void dx_release(struct dx_frame *frames, struct inode *dir); + static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail); +@@ -431,22 +432,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + { + struct ext4_dir_entry *dp; + struct dx_root_info *root; +- int count_offset; ++ int count_offset, dot_rec_len, dotdot_rec_len; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; +- else if (le16_to_cpu(dirent->rec_len) == 12) { +- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); ++ else { ++ dot_rec_len = le16_to_cpu(dirent->rec_len); ++ dp = (struct ext4_dir_entry *)(((void *)dirent) + dot_rec_len); + if (le16_to_cpu(dp->rec_len) != +- EXT4_BLOCK_SIZE(inode->i_sb) - 12) ++ EXT4_BLOCK_SIZE(inode->i_sb) - dot_rec_len) + return NULL; +- root = (struct dx_root_info *)(((void *)dp + 12)); ++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp, NULL); ++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; +- count_offset = 32; +- } else +- return NULL; ++ count_offset = 8 + dot_rec_len + dotdot_rec_len; ++ } + + if (offset) + *offset = count_offset; +@@ -549,13 +551,14 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. 
+ */ +-struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de, struct inode *i_dir) + { ++ BUG_ON(de->name_len != 1); + /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, i_dir)); + + /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, i_dir)); + + return (struct dx_root_info *)de; + } +@@ -600,11 +603,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned int entry_space = dir->i_sb->s_blocksize - +- ext4_dir_rec_len(1, NULL) - +- ext4_dir_rec_len(2, NULL) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); ++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de, NULL) - ++ EXT4_DIR_ENTRY_LEN(dotdot_de, NULL) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -722,7 +730,7 @@ static struct stats dx_show_leaf(struct inode *dir, + (unsigned) ((char *) de - base)); + #endif + } +- space += ext4_dir_rec_len(de->name_len, dir); ++ space += EXT4_DIR_ENTRY_LEN(de, dir); + names++; + } + de = ext4_next_entry(de, size); +@@ -816,7 +824,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + +- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data); ++ info = dx_get_dx_info((struct 
ext4_dir_entry_2 *)frame->bh->b_data, dir); + if (info->hash_version != DX_HASH_TEA && + info->hash_version != DX_HASH_HALF_MD4 && + info->hash_version != DX_HASH_LEGACY && +@@ -872,11 +880,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, +- info->info_length)) { ++ if (dx_get_limit(entries) != ++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, info->info_length)); ++ dx_root_limit(dir, ++ (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)); + goto fail; + } + +@@ -953,7 +964,7 @@ fail: + return ret_err; + } + +-static void dx_release(struct dx_frame *frames) ++static void dx_release(struct dx_frame *frames, struct inode *dir) + { + struct dx_root_info *info; + int i; +@@ -962,7 +973,7 @@ static void dx_release(struct dx_frame *frames) + if (frames[0].bh == NULL) + return; + +- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data, dir); + /* save local copy, "info" may be freed after brelse() */ + indirect_levels = info->indirect_levels; + for (i = 0; i <= indirect_levels; i++) { +@@ -1263,12 +1274,12 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + (count && ((hashval & 1) == 0))) + break; + } +- dx_release(frames); ++ dx_release(frames, dir); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; + errout: +- dx_release(frames); ++ dx_release(frames, dir); + return (err); + } + +@@ -1801,7 +1812,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); + success: +- dx_release(frames); ++ 
dx_release(frames, dir); + return bh; + } + +@@ -1925,7 +1936,7 @@ dx_move_dirents(struct inode *dir, char *from, char *to, + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = +@@ -1958,7 +1969,7 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -2101,14 +2112,21 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de) ++ struct ext4_dir_entry_2 **dest_de, ++ int *dlen) + { + struct ext4_dir_entry_2 *de; +- unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); ++ unsigned short reclen; + int nlen, rlen; + unsigned int offset = 0; + char *top; + ++ if (dlen) { ++ reclen = ext4_dir_rec_len(fname_len(fname) + *dlen, dir); ++ *dlen = 0; ++ } else { ++ reclen = ext4_dir_rec_len(fname_len(fname), dir); ++ } + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { +@@ -2117,10 +2135,31 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + return -EFSCORRUPTED; + if (ext4_match(dir, fname, de)) + return -EEXIST; +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? 
rlen - nlen : rlen) >= reclen) + break; ++ ++ /* Then for dotdot entries, check for the smaller space ++ * required for just the entry, no FID ++ */ ++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { ++ if ((de->inode ? rlen - nlen : rlen) >= ++ ext4_dir_rec_len(fname_len(fname), dir)) { ++ /* set dlen = 1 to indicate not ++ * enough space to store fid ++ */ ++ if (dlen) ++ *dlen = 1; ++ break; ++ } ++ /* The new ".." entry must be written over the ++ * previous ".." entry, which is the first ++ * entry traversed by this scan. If it doesn't ++ * fit, something is badly wrong, so -EIO. ++ */ ++ return -EIO; ++ } + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } +@@ -2135,12 +2174,13 @@ void ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname) ++ struct ext4_filename *fname, ++ void *data) + { + + int nlen, rlen; + +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +@@ -2161,6 +2201,12 @@ void ext4_insert_dentry(struct inode *dir, + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo->minor_hash); + } ++ if (data) { ++ de->name[fname_len(fname)] = 0; ++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + } + + /* +@@ -2178,14 +2224,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +- int err, err2; ++ int err, err2, dlen = 0; ++ unsigned char *data; + ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ EXT4_I(inode)->i_dirdata); + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { ++ if (data) ++ dlen = (*data) + 1; + err = ext4_find_dest_de(dir, 
inode, bh, bh->b_data, +- blocksize - csum_size, fname, &de); ++ blocksize - csum_size, fname, &de, &dlen); + if (err) + return err; + } +@@ -2198,7 +2249,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + } + + /* By now the buffer is marked for journaling */ +- ext4_insert_dentry(dir, inode, de, blocksize, fname); ++ /* If writing the short form of "dotdot", don't add the data section */ ++ if (dlen == 1) ++ data = NULL; ++ ext4_insert_dentry(dir, inode, de, blocksize, fname, data); + + /* + * XXX shouldn't update any times until successful +@@ -2296,7 +2350,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + blocksize); + + /* initialize hashing info */ +- dx_info = dx_get_dx_info(dot_de); ++ dx_info = dx_get_dx_info(dot_de, dir); + memset(dx_info, 0, sizeof(*dx_info)); + dx_info->info_length = sizeof(*dx_info); + if (ext4_hash_in_dirent(dir)) +@@ -2307,7 +2361,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + entries = (void *)dx_info + sizeof(*dx_info); + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +@@ -2348,7 +2403,7 @@ out_frames: + */ + if (retval) + ext4_mark_inode_dirty(handle, dir); +- dx_release(frames); ++ dx_release(frames, dir); + brelse(bh2); + return retval; + } +@@ -2361,6 +2416,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2376,21 +2433,25 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + /* the first item must be "." 
*/ +- assert(de->name_len == 1 && de->name[0] == '.'); ++ ASSERT(de->name_len == 1 && de->name[0] == '.'); + len = le16_to_cpu(de->rec_len); +- assert(len >= EXT4_DIR_REC_LEN(1)); +- if (len > EXT4_DIR_REC_LEN(1)) { ++ ASSERT(len >= EXT4_DIR_REC_LEN(1, dir)); ++ if (len > EXT4_DIR_REC_LEN(1, dir)) { + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, dir_block, EXT4_JTR_NONE); + if (err) + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1, dir)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_REC_LEN(1, dir); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *)dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ ASSERT(len == 0 || len >= EXT4_DIR_REC_LEN(2 + dlen, dir)); + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -2404,10 +2465,15 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + if (len > 0) + de->rec_len = cpu_to_le16(len); + else +- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ ASSERT(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2, dir)); + de->name_len = 2; + strcpy(de->name, ".."); +- ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, *data); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -2445,6 +2511,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + ext4_lblk_t block, blocks; + int csum_size = 0; + ++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +@@ -2687,7 +2754,7 @@ again: + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + 
info = dx_get_dx_info((struct ext4_dir_entry_2 *) +- frames[0].bh->b_data); ++ frames[0].bh->b_data, dir); + info->indirect_levels = 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", +@@ -2713,7 +2780,7 @@ journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: + brelse(bh); +- dx_release(frames); ++ dx_release(frames, dir); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ +@@ -3016,38 +3083,73 @@ err_unlock_inode: + return err; + } + ++struct tp_block { ++ struct inode *inode; ++ void *data1; ++ void *data2; ++}; ++ + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) + { ++ void *data1 = NULL, *data2 = NULL; ++ int dot_reclen = 0; ++ ++ if (dotdot_real_len == 10) { ++ struct tp_block *tpb = (struct tp_block *)inode; ++ data1 = tpb->data1; ++ data2 = tpb->data2; ++ inode = tpb->inode; ++ dotdot_real_len = 0; ++ } + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de, NULL)); ++ ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; ++ ++ strcpy(de->name, ".."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); 
++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - +- (csum_size + ext4_dir_rec_len(1, NULL)), +- blocksize); ++ (csum_size + dot_reclen), blocksize); + else + de->rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(de->name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(de, NULL), + blocksize); +- strcpy(de->name, ".."); +- ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); + } + + int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { ++ struct tp_block param; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + ext4_lblk_t block = 0; +@@ -3071,7 +3173,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; +- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); ++ param.inode = inode; ++ param.data1 = (void *)data1; ++ param.data2 = (void *)data2; ++ ext4_init_dot_dotdot((struct inode *)(&param), de, blocksize, ++ csum_size, dir->i_ino, 10); + set_nlink(inode, 2); + if (csum_size) + ext4_initialize_dirent_tail(dir_block, blocksize); +@@ -3086,6 +3192,29 @@ out: + return err; + } + ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. 
*/ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2) ++{ ++ int rc; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); ++ if (!rc) ++ rc = ext4_mark_inode_dirty(handle, inode); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ + static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) + { +@@ -3113,7 +3242,7 @@ retry: + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +- err = ext4_init_new_dir(handle, dir, inode); ++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 4be1994..a2fcbf8 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1672,7 +1672,7 @@ enum { + Opt_inlinecrypt, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, +- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, ++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, +@@ -1756,6 +1756,7 @@ static const match_table_t tokens = { + {Opt_nolazytime, "nolazytime"}, + {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, + {Opt_nodelalloc, "nodelalloc"}, ++ {Opt_dirdata, "dirdata"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, +@@ -2000,6 +2001,7 @@ static const struct mount_opts { + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + 
{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_offusrjquota, 0, MOPT_Q}, ++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, +-- +2.34.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch @@ -0,0 +1,98 @@ +Subject: [PATCH] linux-5.14/ext4-hash-indexed-dir-dotdot-update.patch + +--- + fs/ext4/namei.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 95b21f5..e4514c9 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2301,6 +2301,74 @@ out_frames: + return retval; + } + ++/* update ".." for hash-indexed directory, split the item "." if necessary */ ++static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct buffer_head *dir_block; ++ struct ext4_dir_entry_2 *de; ++ int len, journal = 0, err = 0; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ handle->h_sync = 1; ++ ++ dir_block = ext4_bread(handle, dir, 0, 0); ++ if (IS_ERR(dir_block)) { ++ err = PTR_ERR(dir_block); ++ goto out; ++ } ++ ++ de = (struct ext4_dir_entry_2 *)dir_block->b_data; ++ /* the first item must be "." 
*/ ++ assert(de->name_len == 1 && de->name[0] == '.'); ++ len = le16_to_cpu(de->rec_len); ++ assert(len >= EXT4_DIR_REC_LEN(1)); ++ if (len > EXT4_DIR_REC_LEN(1)) { ++ BUFFER_TRACE(dir_block, "get_write_access"); ++ err = ext4_journal_get_write_access(handle, dir->i_sb, dir_block, EXT4_JTR_NONE); ++ if (err) ++ goto out_journal; ++ ++ journal = 1; ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ } ++ ++ len -= EXT4_DIR_REC_LEN(1); ++ assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ if (!journal) { ++ BUFFER_TRACE(dir_block, "get_write_access"); ++ err = ext4_journal_get_write_access(handle, dir->i_sb, dir_block, EXT4_JTR_NONE); ++ if (err) ++ goto out_journal; ++ } ++ ++ de->inode = cpu_to_le32(inode->i_ino); ++ if (len > 0) ++ de->rec_len = cpu_to_le16(len); ++ else ++ assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ de->name_len = 2; ++ strcpy(de->name, ".."); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ ++out_journal: ++ if (journal) { ++ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); ++ err = ext4_handle_dirty_dirblock(handle, dir, dir_block); ++ ext4_mark_inode_dirty(handle, dir); ++ } ++ brelse(dir_block); ++ ++out: ++ return err; ++} ++ + /* + * ext4_add_entry() + * +@@ -2357,6 +2425,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + } + + if (is_dx(dir)) { ++ if (dentry->d_name.len == 2 && ++ memcmp(dentry->d_name.name, "..", 2) == 0) ++ return ext4_update_dotdot(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; +-- +2.31.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles15sp4/ext4-misc.patch @@ -0,0 +1,226 @@ +Subject: [PATCH] ext4-misc + +--- + fs/ext4/ext4.h | 25 ++++++++++++++++++++++++- + fs/ext4/ialloc.c | 3 ++- + fs/ext4/inode.c | 16 ++++++++++++++++ + fs/ext4/namei.c | 9 ++++++--- + fs/ext4/super.c | 10 ++-------- + 
fs/ext4/xattr.c | 2 ++ + 6 files changed, 52 insertions(+), 13 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 8d2c493..c96d89f 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1913,6 +1913,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++#define JOURNAL_START_HAS_3ARGS 1 ++ + /* + * Codes for operating systems + */ +@@ -2152,7 +2154,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl + + EXTN_FEATURE_FUNCS(2) + EXTN_FEATURE_FUNCS(3) +-EXTN_FEATURE_FUNCS(4) ++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_compat & ++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & ++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & ++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); ++} + + static inline bool ext4_has_compat_features(struct super_block *sb) + { +@@ -3687,6 +3703,13 @@ struct ext4_extent; + #define EXT_MAX_BLOCKS 0xffffffff + + extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); ++extern void ext4_inc_count(struct inode *inode); ++extern void ext4_dec_count(struct inode *inode); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block); + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); + extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 
5d0a11d..4840190 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -120,7 +120,7 @@ verified: + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. + */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -215,6 +215,7 @@ out: + put_bh(bh); + return ERR_PTR(err); + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! When we get the inode, we're the only people +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 06ee22e..bb109d9 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -6204,3 +6204,19 @@ out_error: + ext4_journal_stop(handle); + goto out; + } ++EXPORT_SYMBOL(ext4_map_blocks); ++EXPORT_SYMBOL(ext4_truncate); ++EXPORT_SYMBOL(ext4_iget); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_itable_unused_count); ++EXPORT_SYMBOL(ext4_force_commit); ++EXPORT_SYMBOL(__ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_get_group_desc); ++EXPORT_SYMBOL(__ext4_journal_get_write_access); ++EXPORT_SYMBOL(__ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); ++EXPORT_SYMBOL(__ext4_std_error); ++EXPORT_SYMBOL(ext4fs_dirhash); ++EXPORT_SYMBOL(ext4_get_inode_loc); ++EXPORT_SYMBOL(__ext4_journal_ensure_credits); +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 0d5b8ea..f207dd5 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -50,7 +50,7 @@ + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +-static struct buffer_head *ext4_append(handle_t *handle, ++struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { +@@ -205,6 +205,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + } + return bh; + } ++EXPORT_SYMBOL(ext4_append); + + #ifdef DX_DEBUG + #define dxtrace(command) command +@@ -2753,23 +2754,25 @@ EXPORT_SYMBOL(ext4_delete_entry); + * for checking 
S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. + */ +-static void ext4_inc_count(struct inode *inode) ++void ext4_inc_count(struct inode *inode) + { + inc_nlink(inode); + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); + } ++EXPORT_SYMBOL(ext4_inc_count); + + /* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +-static void ext4_dec_count(struct inode *inode) ++void ext4_dec_count(struct inode *inode) + { + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + } ++EXPORT_SYMBOL(ext4_dec_count); + + + /* +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 7b5df25..4be1994 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -5605,7 +5605,7 @@ static void ext4_update_super(struct super_block *sb) + __ext4_update_tstamp(&es->s_first_error_time, + &es->s_first_error_time_hi, + sbi->s_first_error_time); +- strncpy(es->s_first_error_func, sbi->s_first_error_func, ++ strlcpy(es->s_first_error_func, sbi->s_first_error_func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = + cpu_to_le32(sbi->s_first_error_line); +@@ -5619,7 +5619,7 @@ static void ext4_update_super(struct super_block *sb) + __ext4_update_tstamp(&es->s_last_error_time, + &es->s_last_error_time_hi, + sbi->s_last_error_time); +- strncpy(es->s_last_error_func, sbi->s_last_error_func, ++ strlcpy(es->s_last_error_func, sbi->s_last_error_func, + sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); +@@ -6826,16 +6826,12 @@ static int __init ext4_init_fs(void) + if (err) + goto out05; + +- register_as_ext3(); +- register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; + out: +- unregister_as_ext2(); +- 
unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); + out05: + destroy_inodecache(); +@@ -6860,8 +6856,6 @@ out7: + static void __exit ext4_exit_fs(void) + { + ext4_destroy_lazyinit_thread(); +- unregister_as_ext2(); +- unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); + destroy_inodecache(); +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index f9b4602..f5af950 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -671,6 +671,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, + up_read(&EXT4_I(inode)->xattr_sem); + return error; + } ++EXPORT_SYMBOL(ext4_xattr_get); + + static int + ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, +@@ -2430,6 +2431,7 @@ cleanup: + ext4_write_unlock_xattr(inode, &no_expand); + return error; + } ++EXPORT_SYMBOL(ext4_xattr_set_handle); + + int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +-- +2.34.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles15sp4/ext4-pdirop.patch @@ -0,0 +1,931 @@ +Subject: [PATCH] sles15sp4/ext4-pdirop.patch + +Single directory performance is critical for HPC workloads. In a +typical use case an application creates a separate output file for +each node and task in a job. As nodes and tasks increase, hundreds +of thousands of files may be created in a single directory within +a short window of time. +Today, both filename lookup and file system modifying operations +(such as create and unlink) are protected with a single lock for +an entire ldiskfs directory. PDO project will remove this +bottleneck by introducing a parallel locking mechanism for entire +ldiskfs directories. This work will enable multiple application +threads to simultaneously lookup, create and unlink in parallel.
+ +This patch contains: + - pdirops support for ldiskfs + - integrate with osd-ldiskfs + +--- + fs/ext4/Makefile | 1 + + fs/ext4/ext4.h | 78 ++++++++ + fs/ext4/namei.c | 465 ++++++++++++++++++++++++++++++++++++++++++----- + fs/ext4/super.c | 1 + + 4 files changed, 504 insertions(+), 41 deletions(-) + +diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile +index 49e7af6..f7ced03 100644 +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ ++ htree_lock.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 54734be..fa5d5d6 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -28,6 +28,7 @@ + #include <linux/mutex.h> + #include <linux/timer.h> + #include <linux/wait.h> ++#include <linux/htree_lock.h> + #include <linux/sched/signal.h> + #include <linux/blockgroup_lock.h> + #include <linux/percpu_counter.h> +@@ -1020,6 +1021,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -2509,6 +2513,72 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define 
ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *ext4_find_entry_locked(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); ++ + struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; +@@ -2887,12 +2957,20 @@ void ext4_insert_dentry(struct inode *dir, struct inode *inode, + void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } ++#endif + } + static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 51c950b..1b8c80e 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -56,6 +56,7 @@ struct buffer_head *ext4_append(handle_t *handle, + { + struct ext4_map_blocks map; + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -63,6 +64,10 @@ struct buffer_head *ext4_append(handle_t *handle, + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + map.m_lblk = *block; + map.m_len = 1; +@@ -73,21 +78,27 @@ struct buffer_head *ext4_append(handle_t *handle, + * directory. 
+ */ + err = ext4_map_blocks(NULL, inode, &map, 0); +- if (err < 0) ++ if (err < 0) { ++ up(&ei->i_append_sem); + return ERR_PTR(err); ++ } + if (err) { ++ up(&ei->i_append_sem); + EXT4_ERROR_INODE(inode, "Logical block already allocated"); + return ERR_PTR(-EFSCORRUPTED); + } + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); +- if (IS_ERR(bh)) ++ if (IS_ERR(bh)) { ++ up(&ei->i_append_sem); + return bh; ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); ++ up(&ei->i_append_sem); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); +@@ -291,7 +302,8 @@ static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame); ++ struct dx_frame *frame, ++ struct htree_lock *lck); + static void dx_release(struct dx_frame *frames, struct inode *dir); + static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, +@@ -307,12 +319,13 @@ static void dx_insert_block(struct dx_frame *frame, + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir); ++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode); ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck); + + /* checksumming functions */ + void ext4_initialize_dirent_tail(struct buffer_head *bh, +@@ -797,6 +810,227 @@ static inline void htree_rep_invariant_check(struct 
dx_entry *at, + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ ext4_find_entry_locked(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ ext4_add_entry_locked(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned 
flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call multiple times, or even if it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++
ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -808,10 +1042,11 @@ static inline void htree_rep_invariant_check(struct dx_entry *at, + */ + static struct dx_frame * + dx_probe(struct ext4_filename *fname, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck) + { + unsigned count, indirect, level, i; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); +@@ -895,8 +1130,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + level = 0; + blocks[0] = 0; + while (1) { ++ if (indirect == level) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation ++ */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); +@@ -923,6 +1166,74 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + frame->entries = entries; + frame->at = at; + ++ if (indirect == level) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return ++ */ ++ 
ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to obtain DX-lock ++ */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it ++ */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block ++ */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block ++ */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ + block = dx_get_block(at); + for (i = 0; i <= level; i++) { + if (blocks[i] == block) { +@@ -932,8 +1243,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + goto fail; + } + } +- if (++level > indirect) +- return frame; ++ ++level; + blocks[level] = block; + frame++; + frame->bh = ext4_read_dirblock(dir, block, INDEX); +@@ -1004,7 +1314,7 @@ static void dx_release(struct dx_frame *frames, struct inode *dir) + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct
buffer_head *bh; +@@ -1019,12 +1329,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -1047,6 +1367,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -1055,6 +1382,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1216,10 +1544,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); + if (IS_ERR(frame)) + return PTR_ERR(frame); +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1259,7 +1587,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1579,7 +1907,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + static struct buffer_head *__ext4_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1621,7 +1949,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto restart; + } + if (is_dx(dir)) { +- ret = ext4_dx_find_entry(dir, fname, res_dir); ++ ret = ext4_dx_find_entry(dir, fname, res_dir, lck); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1631,6 +1959,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + ret = NULL; + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); +@@ -1721,10 +2050,10 @@ cleanup_and_exit: + return ret; + } + +-static struct buffer_head *ext4_find_entry(struct inode *dir, ++struct buffer_head *ext4_find_entry_locked(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + int err; + struct ext4_filename fname; +@@ -1736,12 +2065,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ++ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + + ext4_fname_free_filename(&fname); + return bh; + } + ++EXPORT_SYMBOL(ext4_find_entry_locked); ++ + static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct dentry *dentry, + struct ext4_dir_entry_2 **res_dir) +@@ -1757,7 +2088,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ++ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + + ext4_fname_free_filename(&fname); + return bh; +@@ -1765,7 +2096,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; +@@ -1776,7 +2108,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + #ifdef CONFIG_FS_ENCRYPTION + *res_dir = NULL; + #endif +- 
frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { +@@ -1798,7 +2130,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", +@@ -1987,8 +2319,9 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned continued; +@@ -2065,8 +2398,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, +- blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(dir, data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(dir, data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(dir, data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -2084,12 +2423,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + +- /* Which block gets the new entry? 
*/ +- if (hinfo->hash >= hash2) { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirblock(handle, dir, bh2); + if (err) + goto journal_error; +@@ -2388,7 +2736,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; +@@ -2497,8 +2845,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; +@@ -2547,9 +2895,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, &fname, dir, inode); ++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + /* Can we just ignore htree data? 
*/ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, +@@ -2612,12 +2961,14 @@ out: + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry_locked); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode) ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2629,7 +2980,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + + again: + restart = 0; +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; +@@ -2664,6 +3015,12 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } ++ + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2767,8 +3124,32 @@ again: + restart = 1; + goto journal_error; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should be equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. 
*/ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; +@@ -2779,6 +3160,8 @@ again: + journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames, dir); + /* @restart is true means htree-path has been changed, we need to +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index a2fcbf8..82ea5f6 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1291,6 +1291,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + + inode_set_iversion(&ei->vfs_inode, 1); + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); + spin_lock_init(&ei->i_prealloc_lock); +-- +2.34.1 + --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.14.21-sles15sp4.series @@ -0,0 +1,30 @@ +rhel8/ext4-inode-version.patch +linux-5.4/ext4-lookup-dotdot.patch +linux-5.14/ext4-print-inum-in-htree-warning.patch +linux-5.14/ext4-prealloc.patch +ubuntu18/ext4-osd-iop-common.patch +sles15sp4/ext4-misc.patch +linux-5.14/ext4-mballoc-extra-checks.patch +sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.14/ext4-kill-dx-root.patch +linux-5.8/ext4-mballoc-pa-free-mismatch.patch +sles15sp4/ext4-data-in-dirent.patch +rhel8/ext4-nocmtime.patch +base/ext4-htree-lock.patch +sles15sp4/ext4-pdirop.patch +linux-5.8/ext4-max-dir-size.patch +linux-5.14/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +linux-5.10/ext4-give-warning-with-dir-htree-growing.patch 
+ubuntu18/ext4-jcb-optimization.patch +linux-5.10/ext4-attach-jinode-in-writepages.patch +rhel8/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.14/ext4-export-mb-stream-allocator-variables.patch +ubuntu19/ext4-iget-with-flags.patch +linux-5.14/export-ext4fs-dirhash-helper.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +linux-5.14/ext4-ialloc-uid-gid-and-pass-owner-down.patch +linux-5.14/ext4-projid-xattrs.patch +base/ext4-delayed-iput.patch +linux-5.14/ext4-xattr-disable-credits-check.patch --- a/lustre/osd-ldiskfs/osd_compat.c +++ b/lustre/osd-ldiskfs/osd_compat.c @@ -810,7 +810,8 @@ update: * making it as invisible temporary may be not worse. OI scrub * will process it later. */ - rc = ldiskfs_journal_get_write_access(th, bh); + rc = osd_ldiskfs_journal_get_write_access(th, parent->i_sb, bh, + LDISKFS_JTR_NONE); if (rc != 0) GOTO(out, rc); --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -2325,9 +2325,11 @@ out: * This is 28 bytes per object which is 28MB for 1M objects ... no so bad. 
*/ #ifdef __LDISKFS_DIR_REC_LEN -#define PER_OBJ_USAGE __LDISKFS_DIR_REC_LEN(20) +# define PER_OBJ_USAGE __LDISKFS_DIR_REC_LEN(20) +#elif defined LDISKFS_DIR_REC_LEN_WITH_DIR +# define PER_OBJ_USAGE LDISKFS_DIR_REC_LEN(20, NULL) #else -#define PER_OBJ_USAGE LDISKFS_DIR_REC_LEN(20) +# define PER_OBJ_USAGE LDISKFS_DIR_REC_LEN(20) #endif /* @@ -5822,8 +5824,9 @@ static int __osd_ea_add_rec(struct osd_t bh = osd_ldiskfs_find_entry(pobj->oo_inode, &child->d_name, &de, NULL, hlock); if (!IS_ERR(bh)) { - rc1 = ldiskfs_journal_get_write_access(oth->ot_handle, - bh); + rc1 = osd_ldiskfs_journal_get_write_access( + oth->ot_handle, pobj->oo_inode->i_sb, bh, + LDISKFS_JTR_NONE); if (rc1 == 0) { if (S_ISDIR(cinode->i_mode)) de->file_type = LDISKFS_DIRENT_LUFID | @@ -7151,12 +7154,22 @@ static int osd_it_ea_key_size(const stru #if defined LDISKFS_DIR_ENTRY_LEN && defined LDISKFS_DIR_ENTRY_LEN_ #undef LDISKFS_DIR_REC_LEN -#define LDISKFS_DIR_REC_LEN(de) LDISKFS_DIR_ENTRY_LEN_((de)) +# if defined LDISKFS_DIR_REC_LEN_WITH_DIR +# define LDISKFS_DIR_REC_LEN(de, dir) LDISKFS_DIR_ENTRY_LEN_((de), (dir)) +# else +# define LDISKFS_DIR_REC_LEN(de) LDISKFS_DIR_ENTRY_LEN_((de)) +# endif +#endif + +#if defined LDISKFS_DIR_REC_LEN_WITH_DIR +# define LDISKFS_DIR_REC_LEN_DIR(de) LDISKFS_DIR_REC_LEN((de), NULL) +#else +# define LDISKFS_DIR_REC_LEN_DIR(de) LDISKFS_DIR_REC_LEN((de)) #endif static inline bool osd_dotdot_has_space(struct ldiskfs_dir_entry_2 *de) { - if (LDISKFS_DIR_REC_LEN(de) >= + if (LDISKFS_DIR_REC_LEN_DIR(de) >= __LDISKFS_DIR_REC_LEN(2 + 1 + sizeof(struct osd_fid_pack))) return true; @@ -7199,7 +7212,8 @@ osd_dirent_reinsert(const struct lu_env /* There is enough space to hold the FID-in-dirent. 
*/ if (osd_dirent_has_space(de, namelen, dir->i_sb->s_blocksize, dotdot)) { - rc = ldiskfs_journal_get_write_access(jh, bh); + rc = osd_ldiskfs_journal_get_write_access(jh, dir->i_sb, bh, + LDISKFS_JTR_NONE); if (rc != 0) RETURN(rc); --- a/lustre/osd-ldiskfs/osd_iam.c +++ b/lustre/osd-ldiskfs/osd_iam.c @@ -539,10 +539,12 @@ static int iam_txn_add(handle_t *handle, struct iam_path *path, struct buffer_head *bh) { int result; + struct super_block *sb = iam_path_obj(path)->i_sb; - result = ldiskfs_journal_get_write_access(handle, bh); + result = osd_ldiskfs_journal_get_write_access(handle, sb, bh, + LDISKFS_JTR_NONE); if (result != 0) - ldiskfs_std_error(iam_path_obj(path)->i_sb, result); + ldiskfs_std_error(sb, result); return result; } @@ -1573,7 +1575,9 @@ iam_new_node(handle_t *h, struct iam_con head = (struct iam_idle_head *)(c->ic_idle_bh->b_data); count = le16_to_cpu(head->iih_count); if (count > 0) { - *e = ldiskfs_journal_get_write_access(h, c->ic_idle_bh); + *e = osd_ldiskfs_journal_get_write_access(h, inode->i_sb, + c->ic_idle_bh, + LDISKFS_JTR_NONE); if (*e != 0) goto fail; @@ -1601,7 +1605,9 @@ iam_new_node(handle_t *h, struct iam_con idle_blocks = (__u32 *)(c->ic_root_bh->b_data + c->ic_descr->id_root_gap + sizeof(struct dx_countlimit)); - *e = ldiskfs_journal_get_write_access(h, c->ic_root_bh); + *e = osd_ldiskfs_journal_get_write_access(h, inode->i_sb, + c->ic_root_bh, + LDISKFS_JTR_NONE); if (*e != 0) goto fail; @@ -1631,7 +1637,8 @@ iam_new_node(handle_t *h, struct iam_con got: /* get write access for the found buffer head */ - *e = ldiskfs_journal_get_write_access(h, bh); + *e = osd_ldiskfs_journal_get_write_access(h, inode->i_sb, bh, + LDISKFS_JTR_NONE); if (*e != 0) { brelse(bh); bh = NULL; @@ -1904,14 +1911,20 @@ int split_index_node(handle_t *handle, s } do_corr(schedule()); BUFFER_TRACE(frame->bh, "get_write_access"); - err = ldiskfs_journal_get_write_access(handle, frame->bh); + err = osd_ldiskfs_journal_get_write_access(handle, + dir->i_sb, + 
frame->bh, + LDISKFS_JTR_NONE); if (err) goto journal_error; } /* Add "safe" node to transaction too */ if (safe + 1 != path->ip_frames) { do_corr(schedule()); - err = ldiskfs_journal_get_write_access(handle, safe->bh); + err = osd_ldiskfs_journal_get_write_access(handle, + dir->i_sb, + safe->bh, + LDISKFS_JTR_NONE); if (err) goto journal_error; } --- a/lustre/osd-ldiskfs/osd_iam_lvar.c +++ b/lustre/osd-ldiskfs/osd_iam_lvar.c @@ -165,26 +165,15 @@ static inline struct lvar_leaf_entry *e_ #define LVAR_HASH_R5 (0) #define LVAR_HASH_PREFIX (0) -#ifdef HAVE_LDISKFSFS_GETHASH_INODE_ARG -/* - * NOTE: doing this breaks on file systems configured with - * case-insensitive file name lookups - * - * kernel 5.2 commit b886ee3e778ec2ad43e276fd378ab492cf6819b7 - * ext4: Support case-insensitive file name lookups - * - * FUTURE: - * We need to pass the struct inode *dir down to hash_build0 - * to enable case-insensitive file name support ext4/ldiskfs - */ -#define e_ldiskfsfs_dirhash(name, len, info) \ - __ldiskfsfs_dirhash(name, len, info) +#ifdef HAVE_LDISKFSFS_DIRHASH_WITH_DIR +#define e_ldiskfsfs_dirhash(dir, name, len, info) \ + ldiskfsfs_dirhash((dir), (name), (len), (info)) #else -#define e_ldiskfsfs_dirhash(name, len, info) \ - ldiskfsfs_dirhash(name, len, info) +#define e_ldiskfsfs_dirhash(dir, name, len, info) \ + ldiskfsfs_dirhash((name), (len), (info)) #endif -static u32 hash_build0(const char *name, int namelen) +static u32 hash_build0(const struct inode *dir, const char *name, int namelen) { u32 result; @@ -204,14 +193,14 @@ static u32 hash_build0(const char *name, hinfo.hash_version = LDISKFS_DX_HASH_TEA; hinfo.seed = NULL; - e_ldiskfsfs_dirhash(name, namelen, &hinfo); + e_ldiskfsfs_dirhash(dir, name, namelen, &hinfo); result = hinfo.hash; if (LVAR_HASH_SANDWICH) { u32 result2; hinfo.hash_version = LDISKFS_DX_HASH_TEA; hinfo.seed = NULL; - e_ldiskfsfs_dirhash(name, namelen, &hinfo); + e_ldiskfsfs_dirhash(dir, name, namelen, &hinfo); result2 = hinfo.hash; result = 
(0xfc000000 & result2) | (0x03ffffff & result); } @@ -224,20 +213,28 @@ enum { HASH_MAX_SIZE = 0x7fffffffUL }; -static u32 hash_build(const char *name, int namelen) +static u32 hash_build(const struct inode *dir, const char *name, int namelen) { u32 hash; - hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE; + hash = (hash_build0(dir, name, namelen) << 1) & HASH_MAX_SIZE; if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA) hash &= HASH_GRAY_AREA - 1; return hash; } -static inline lvar_hash_t get_hash(const struct iam_container *bag, +static inline lvar_hash_t get_hash(const struct inode *dir, const char *name, int namelen) { - return hash_build(name, namelen); + return hash_build(dir, name, namelen); +} + +static inline lvar_hash_t iam_get_hash(const struct iam_leaf *leaf, + const char *name, int namelen) +{ + struct iam_path *iam_path = iam_leaf_path(leaf); + + return get_hash(iam_path_obj(iam_path), name, namelen); } static inline int e_eq(const struct lvar_leaf_entry *ent, @@ -310,6 +307,7 @@ static int n_at_rec(const struct iam_lea static int n_invariant(const struct iam_leaf *leaf) { struct iam_path *path; + struct inode *dir; struct lvar_leaf_entry *scan; struct lvar_leaf_entry *end; lvar_hash_t hash; @@ -323,6 +321,7 @@ static int n_invariant(const struct iam_ if (h_used(n_head(leaf)) > blocksize(leaf)) return 0; + dir = iam_path_obj(iam_path); /* * Delimiting key in the parent index node. Clear least bit to account * for hash collision marker. 
@@ -330,8 +329,7 @@ static int n_invariant(const struct iam_ starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1; for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { nexthash = e_hash(scan); - if (nexthash != get_hash(iam_leaf_container(leaf), - e_char(scan), e_keysize(scan))) { + if (nexthash != get_hash(dir, e_char(scan), e_keysize(scan))) { BREAKPOINT(); return 0; } @@ -455,7 +453,7 @@ static int lvar_lookup(struct iam_leaf * name = kchar(k); namelen = strlen(name); - hash = get_hash(iam_leaf_container(leaf), name, namelen); + hash = iam_get_hash(leaf, name, namelen); found = NULL; found_equal = 0; last = 1; @@ -551,7 +549,7 @@ static int lvar_key_cmp(const struct iam name = kchar(k); - hash = get_hash(iam_leaf_container(l), name, strlen(name)); + hash = iam_get_hash(l, name, strlen(name)); return e_cmp(l, n_cur(l), hash); } @@ -647,8 +645,7 @@ static void lvar_rec_add(struct iam_leaf } h_used_adj(leaf, n_head(leaf), shift); n_cur(leaf)->vle_keysize = cpu_to_le16(ksize); - n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf), - key, ksize)); + n_cur(leaf)->vle_hash = cpu_to_le32(iam_get_hash(leaf, key, ksize)); __lvar_key_set(leaf, k); __lvar_rec_set(leaf, r); assert_corr(n_at_rec(leaf)); @@ -894,7 +891,7 @@ static int lvar_node_load(struct iam_pat if (path->ip_ikey_target == NULL) { path->ip_ikey_target = iam_path_ikey(path, 4); *(lvar_hash_t *)path->ip_ikey_target = - get_hash(path->ip_container, name, + get_hash(iam_path_obj(path), name, strlen(name)); } } --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -1508,6 +1508,15 @@ bool bio_integrity_enabled(struct bio *b # define bio_get_queue(bio) (bio_get_disk(bio)->queue) #endif +#ifdef HAVE_EXT4_JOURNAL_GET_WRITE_ACCESS_4ARGS +# define osd_ldiskfs_journal_get_write_access(handle, sb, bh, flags) \ + ldiskfs_journal_get_write_access((handle), (sb), (bh), (flags)) +#else +# define LDISKFS_JTR_NONE 0 +# define 
osd_ldiskfs_journal_get_write_access(handle, sb, bh, flags) \ + ldiskfs_journal_get_write_access((handle), (bh)) +#endif /* HAVE_EXT4_JOURNAL_GET_WRITE_ACCESS_4ARGS */ + #ifdef HAVE_EXT4_INC_DEC_COUNT_2ARGS #define osd_ldiskfs_inc_count(h, inode) ldiskfs_inc_count((h), (inode)) #define osd_ldiskfs_dec_count(h, inode) ldiskfs_dec_count((h), (inode)) @@ -1636,7 +1645,7 @@ struct osd_bio_private { int osd_get_integrity_profile(struct osd_device *osd, integrity_gen_fn **generate_fn, integrity_vrfy_fn **verify_fn); -#endif /* HAVE_BIO_INTEGRITY_PREP_FN */ +#endif /* HAVE_BIO_INTEGRITY_PREP_FN */ /* HAVE_EXT4_INC_DEC_COUNT_2ARGS */ #ifdef HAVE_BIO_BI_PHYS_SEGMENTS #define osd_bio_nr_segs(bio) ((bio)->bi_phys_segments) --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -2127,7 +2127,9 @@ static int osd_ldiskfs_write_record(stru break; } - err = ldiskfs_journal_get_write_access(handle, bh); + err = osd_ldiskfs_journal_get_write_access(handle, inode->i_sb, + bh, + LDISKFS_JTR_NONE); if (err) { CERROR("journal_get_write_access() returned error %d\n", err);
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor