Skip to content

Latest commit

 

History

History
291 lines (229 loc) · 7.14 KB

2015-04-23-ext3-ext4-directory-difference.md

File metadata and controls

291 lines (229 loc) · 7.14 KB
layout title date categories tags excerpt
post
EXT4/EXT3文件系统目录的一点区别
2015-04-23 13:56:30 -0700
Linux
存储
EXT4/EXT3文件系统目录的一点区别。

问题描述

业务同学在测试TDocker时发现一个奇怪的问题:对一个目录调用lseek(fd, 0, SEEK_END)时,返回了一个很大的数字9223372036854775807(0x7FFFFFFFFFFFFFFF),

而在其它非TDocker机器返回的为4096。

#./calc_dir_size cron_log
file: cron_log, size: 9223372036854775807

为什么会这个差别呢?

原因分析

用strace跟踪一下,的确lseek返回9223372036854775807:

open("cron_log", O_RDONLY)              = 3
fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f338b3a9000
fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
lseek(3, 0, SEEK_END)                   = 9223372036854775807
lseek(3, 0, SEEK_SET)                   = 0
close(3)                                = 0

由于Docker对rootfs默认使用ext4,而其它非Docker机器一般为ext3,初步断定是由文件系统的差别引起的。

Trace kernel,发现对于ext4,lseek会调用ext4_dir_llseek:

  1)               |  vfs_llseek() {
  1)               |    ext4_dir_llseek() {
  1)               |      mutex_lock() {
  1)   0.185 us    |        _cond_resched();
  1)   0.922 us    |      }
  1)   0.182 us    |      mutex_unlock();
  1)   1.648 us    |    }
  1)   2.159 us    |  }
/*
 * File-operations table for ext4 directories.  The entry of interest is
 * .llseek, which is wired to the ext4-specific ext4_dir_llseek rather than
 * to a generic seek helper.
 */
const struct file_operations ext4_dir_operations = {
	.llseek		= ext4_dir_llseek,
	.read		= generic_read_dir,
	.readdir	= ext4_readdir,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.fsync		= ext4_sync_file,
	.release	= ext4_release_dir,
};

来看ext4_dir_llseek的实现:

loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
{
 struct inode *inode = file->f_mapping->host;
 loff_t ret = -EINVAL;
 int dx_dir = is_dx_dir(inode);
 mutex_lock(&inode->i_mutex);
 /* NOTE: relative offsets with dx directories might not work
  *       as expected, as it is difficult to figure out the
  *       correct offset between dx hashes */
 switch (origin) {
 case SEEK_END:
  if (unlikely(offset > 0))
   goto out_err; /* not supported for directories */
  /* so only negative offsets are left, does that have a
   * meaning for directories at all? */
  if (dx_dir)
   offset += ext4_get_htree_eof(file);
  else
   offset += inode->i_size;
...
 
/*
 * Return 32- or 64-bit end-of-file for dx directories.
 *
 * The 32-bit value is returned when FMODE_32BITHASH is set in filp->f_mode,
 * or when FMODE_64BITHASH is not set and is_32bit_api() is true.  In every
 * other case the 64-bit value is returned.
 */
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
 if ((filp->f_mode & FMODE_32BITHASH) ||
     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
  return EXT4_HTREE_EOF_32BIT;
 else
  return EXT4_HTREE_EOF_64BIT;
}
 
 
/*
 * 32 and 64 bit signed EOF for dx directories.
 * EXT4_HTREE_EOF_32BIT evaluates to 0x7FFFFFFF and EXT4_HTREE_EOF_64BIT to
 * 0x7FFFFFFFFFFFFFFF (decimal 9223372036854775807).
 */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)

由于ext4默认开启htree-indexed directory,参考这里:

# tune2fs -l /dev/mapper/docker-8:4-xxx|grep "Filesystem features"
Filesystem features:      has_journal ext_attr resize_inode dir_index filetype needs_recovery extent flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize

所以,ext4_dir_llseek会返回EXT4_HTREE_EOF_64BIT,即0x7FFFFFFFFFFFFFFF。

为什么ext3会返回4096呢?

/*
 * File-operations table for ext3 directories.  Here .llseek is the generic
 * generic_file_llseek; there is no ext3-specific directory seek routine in
 * this table.
 */
const struct file_operations ext3_dir_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
	.unlocked_ioctl	= ext3_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext3_compat_ioctl,
#endif
	.fsync		= ext3_sync_file,	/* BKL held */
	.release	= ext3_release_dir,
};

/*
 * Locking wrapper around generic_file_llseek_unlocked().
 *
 * Takes the i_mutex of the inode behind file->f_dentry, delegates the seek
 * with the same (file, offset, origin) arguments, releases the mutex and
 * returns whatever the unlocked helper returned.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t rval;

	mutex_lock(&file->f_dentry->d_inode->i_mutex);
	rval = generic_file_llseek_unlocked(file, offset, origin);
	mutex_unlock(&file->f_dentry->d_inode->i_mutex);

	return rval;
}

loff_t
generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_mapping->host;

	switch (origin) {
	case SEEK_END:
		offset += inode->i_size;
		break;
...

从代码可以看到,与ext4不同的是,ext3调用的是generic_file_llseek,而后者直接返回inode->i_size。(至于为什么是4096,这里不再讨论)

问题就到这里结束了吗?

tlinux

在分析原因的过程中,发现在tlinux内核的机器上,对于ext4目录,lseek仍然返回4096。

/*
 * ext4 directory file-operations table as found in the tlinux kernel.
 * In this variant .llseek is generic_file_llseek, not ext4_dir_llseek.
 */
const struct file_operations ext4_dir_operations = {
    .llseek     = generic_file_llseek,
    .read       = generic_read_dir,
    .readdir    = ext4_readdir,     /* we take BKL. needed?*/
    .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext4_compat_ioctl,
#endif
    .fsync      = ext4_sync_file,
    .release    = ext4_release_dir,
};

原来tlinux内核下的ext4也是调用generic_file_llseek。

XFS

对于XFS,lseek返回0,与stat的结果不同:

#./calc_dir_size /data/dir1 
file: /data/dir1, size: 0
#stat /data/dir1
  File: `/data/dir1'
  Size: 6               Blocks: 0          IO Block: 4096   directory
Device: 804h/2052d      Inode: 1879049233  Links: 2

这是因为在XFS中fstat与lseek取的值不同造成的。

fstat

vfs_fstat -> vfs_getattr -> xfs_vn_getattr:

STATIC int
xfs_vn_getattr(
  struct vfsmount   *mnt,
  struct dentry   *dentry,
  struct kstat    *stat)
{
  struct inode    *inode = dentry->d_inode;
  struct xfs_inode  *ip = XFS_I(inode);
  struct xfs_mount  *mp = ip->i_mount;

  trace_xfs_getattr(ip);

  if (XFS_FORCED_SHUTDOWN(mp))
    return -XFS_ERROR(EIO);

  stat->size = XFS_ISIZE(ip);
...

/*
 * Return the size XFS reports for an inode.
 *
 * For regular files (S_ISREG on ip->i_d.di_mode) the value comes from
 * i_size_read() on the VFS inode.  For everything else, directories
 * included, it is ip->i_d.di_size instead.
 */
static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
{
  if (S_ISREG(ip->i_d.di_mode))
    return i_size_read(VFS_I(ip));
  return ip->i_d.di_size;
}

lseek

/*
 * File-operations table for XFS directories.  As in the ext3 table, .llseek
 * is the generic generic_file_llseek.
 */
const struct file_operations xfs_dir_file_operations = {
  .open   = xfs_dir_open,
  .read   = generic_read_dir,
  .readdir  = xfs_file_readdir,
  .llseek   = generic_file_llseek,
  .unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
  .compat_ioctl = xfs_file_compat_ioctl,
#endif
  .fsync    = xfs_dir_fsync,
};

可以看到,XFS的lseek,也是调用的generic_file_llseek,返回inode->i_size。

总结

对目录调用lseek,不同的文件系统返回的值可能并不一样,这由文件系统自身的实现决定。如果业务希望保持ext3的传统,ext4下可以关闭dir_index特性:

# tune2fs -O ^dir_index /dev/mapper/docker-8:4-xxx

# 附测试程序

#include <cstdio>
#include <iostream>
using namespace std;

/**
 * Report the size of a file as seen through the stdio seek interface.
 *
 * Opens szFileName read-only, seeks to the end of the stream and returns the
 * offset ftell() reports there.  When the path is a directory, the value is
 * whatever the underlying filesystem yields for a SEEK_END seek.
 *
 * @param szFileName  path to open; must not be NULL.
 * @return the end offset, or -1 if the file cannot be opened or the
 *         seek/tell fails (a diagnostic line is written to stdout).
 */
long GetFileSize(const char* szFileName)
{
  FILE* pFile = fopen(szFileName, "r");
  if (NULL == pFile)
  {
    std::cout << "fopen failed. file: " << szFileName << std::endl;
    return -1;
  }

  // Only trust ftell() when the seek itself succeeded; otherwise it would
  // report the unchanged start position (0) as if it were the size.
  long lLength = -1;
  if (0 == fseek(pFile, 0, SEEK_END))
  {
    lLength = ftell(pFile);
  }
  if (lLength < 0)
  {
    std::cout << "fseek/ftell failed. file: " << szFileName << std::endl;
  }

  // Rewind before closing; the result is already captured, so the outcome
  // of this seek is deliberately ignored.
  (void)fseek(pFile, 0, SEEK_SET);

  fclose(pFile);

  return lLength;
}

/**
 * Entry point: prints the size GetFileSize() reports for the path given as
 * the first command-line argument.
 *
 * @return 1 (after printing a usage line) when no path is supplied,
 *         0 otherwise.
 */
int main(int argc, char* argv[])
{
  if (argc < 2)
  {
    std::cout << "Usage: " << argv[0] << " filepath" << std::endl;
    return 1;
  }

  // Query the size before starting the output line: GetFileSize() prints its
  // own diagnostic on failure, and calling it inside the << chain would let
  // that message land in the middle of the "file: ..., size:" line.
  const long lSize = GetFileSize(argv[1]);
  std::cout << "file: " << argv[1] << ", size: " << lSize << std::endl;

  return 0;
}