Discussion:
[Btrfs-devel] cloning file data
Sage Weil
2008-04-24 22:47:08 UTC
Permalink
Hi-

I'm working on a clone ioctl that will quickly and efficiently duplicate
the contents of a file, e.g.

/*
 * Example: clone the contents of argv[1] into argv[2] with BTRFS_IOC_CLONE.
 *
 * The ioctl is issued on the destination descriptor and takes the source
 * descriptor as its argument.
 *
 * Returns 0 on success, 1 if too few arguments were given or if opening
 * either file, the clone ioctl, or closing the destination fails.
 */
int main(int argc, const char **argv)
{
	int in, out, ret;

	/* need both a source and a destination path */
	if (argc < 3)
		return 1;

	in = open(argv[1], O_RDONLY);
	if (in < 0)
		return 1;

	out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
	if (out < 0) {
		close(in);
		return 1;
	}

	/* report a failed clone through the exit status instead of ignoring it */
	ret = ioctl(out, BTRFS_IOC_CLONE, in) < 0 ? 1 : 0;

	close(in);
	/* a failed close on the written file also counts as failure */
	if (close(out) < 0)
		ret = 1;
	return ret;
}

I've probably got the locking order a bit wrong, lots of error handling is
missing, and I suspect there's a cleaner way to do the target inode size
update, but it behaves well enough in my (limited :) testing.

Oh, and I wasn't certain the 'offset' in file_extent_item could be safely
ignored when duplicating the extent reference. My assumption was that it
is orthogonal to extent allocation and isn't related to the backref.
However, btrfs_insert_file_extent() always set offset=0. I'm guessing I
need to add an argument there and fix up the other callers?

Anyway, any comments or suggestions (on the interface or implementation)
are welcome.. :)

sage


diff -r 1791a620d509 inode.c
--- a/inode.c Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c Thu Apr 24 15:10:17 2008 -0700
@@ -18,6 +18,7 @@

#include <linux/bio.h>
#include <linux/buffer_head.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
@@ -2726,6 +2727,158 @@ long btrfs_ioctl_trans_end(struct file *
return 0;
}

+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf,
+ int slot,
+ struct btrfs_key *key,
+ u64 destino)
+{
+ struct btrfs_path *cpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(leaf, slot);
+ int dstoff;
+ struct btrfs_key ckey = *key;
+ int ret;
+
+ ckey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+ dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+ copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+ btrfs_item_ptr_offset(leaf, slot),
+ len);
+ btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ u64 pos;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int nextret;
+ int slot;
+
+ src_file = fget(src_fd);
+ if (!src_file)
+ return -EBADF;
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ ret = -ENOTEMPTY;
+ if (inode->i_size)
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or another */
+ filemap_write_and_wait(src->i_mapping);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ path = btrfs_alloc_path();
+ pos = 0;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+ pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+ next_slot:
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ printk("key(%llu %x %llu)\n",
+ key.objectid, key.type, key.offset);
+ if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+ key.objectid != src->i_ino)
+ goto out;
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int found_type;
+ u64 len;
+ pos = key.offset;
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ printk(" %llu~%llu disk %llu~%llu off %llu\n",
+ pos, len, ds, dl,
+ btrfs_file_extent_offset(leaf, extent));
+ btrfs_insert_file_extent(trans, root,
+ inode->i_ino, pos,
+ ds, dl, len);
+ btrfs_inc_extent_ref(trans, root, ds, dl,
+ root->root_key.objectid,
+ trans->transid,
+ inode->i_ino, pos);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ dup_item_to_inode(trans, root, path, leaf, slot,
+ &key, inode->i_ino);
+ pos = key.offset + len;
+ } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+ dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ inode->i_ino);
+
+ nritems = btrfs_header_nritems(leaf);
+ if (slot >= nritems - 1) {
+ nextret = btrfs_next_leaf(root, path);
+ if (nextret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+ goto next_slot;
+ }
+
+out:
+ ret = 0;
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+ i_size_write(inode, src->i_size);
+ inode->i_blocks = src->i_blocks;
+ mark_inode_dirty(inode);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+out_fput:
+ fput(src_file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2744,6 +2897,9 @@ long btrfs_ioctl(struct file *file, unsi
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ case BTRFS_IOC_CLONE:
+ btrfs_ioctl_clone(file, arg);
return 0;
}

diff -r 1791a620d509 ioctl.h
--- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h Thu Apr 24 15:10:17 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
#endif
Chris Mason
2008-04-25 13:41:35 UTC
Permalink
Post by Sage Weil
Hi-
I'm working on a clone ioctl that will quickly and efficiently duplicate
the contents of a file, e.g.
Very cool. I'd actually love to see this wrapped into a program that will
cow a directory tree. Basically the same as cp -al, but with cow instead of
linking.
Post by Sage Weil
int main(int argc, const char **argv)
{
int in = open(argv[1], O_RDONLY);
int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
ioctl(out, BTRFS_IOC_CLONE, in);
close(in);
close(out);
return 0;
}
I've probably got the locking order a bit wrong, lots of error handling is
missing, and I suspect there's a cleaner way to do the target inode size
update, but it behaves well enough in my (limited :) testing.
Oh, and I wasn't certain the 'offset' in file_extent_item could be safely
ignored when duplicating the extent reference. My assumption was that it
is orthogonal to extent allocation and isn't related to the backref.
However, btrfs_insert_file_extent() always set offset=0. I'm guessing I
need to add an argument there and fix up the other callers?
Yes, you need to preserve the offset. There's only one place right now that
sets a non-zero offset and it inserts the extent by hand for other reasons
(if you're brave, file.c:btrfs_drop_extents)

The reason file extents have an offset field is to allow COW without
read/modify/write. Picture something like this:

# create a single 100MB extent in file foo
dd if=/dev/zero of=foo bs=1M count=100
sync

# write into the middle
dd if=/dev/zero of=foo bs=4k count=1 seek=100 conv=notrunc
sync

We've written into the middle of that 100MB extent, and we need to do COW.
One option is to read the whole thing, change 4k and write it all back.
Instead, btrfs does something like this (+/- off by need more coffee errors):

file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]

An extra reference is taken on the old extent to reflect that we're pointing
to it twice.
Post by Sage Weil
Anyway, any comments or suggestions (on the interface or implementation)
are welcome.. :)
By taking the inode mutex, you protect against file_write and truncates
changing the file. But, we also need to prevent mmaps from changing the file
pages as well. What you want to do is lock all the file bytes in the extent
tree:

lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS);

But unfortunately, the code to fill delayed allocation takes that same lock.
So you need to loop a bit:

while(1) {
filemap_write_and_wait(src_inode);
lock_extent()
if (BTRFS_I(src_inode)->delalloc_bytes == 0)
break;
unlock_extent()
}

That should keep you from racing with btrfs_page_mkwrite()

-chris
Zach Brown
2008-04-25 16:50:36 UTC
Permalink
Post by Chris Mason
We've written into the middle of that 100MB extent, and we need to do COW.
One option is to read the whole thing, change 4k and write it all back.
file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]
An extra reference is taken on the old extent to reflect that we're pointing
to it twice.
If you learn how to parse the debug-tree output then this can be seen
pretty easily. To do this we can watch the leaves of the fs tree for
the inode and extent items of the file we work with:

# dd if=/dev/zero bs=1M count=1k of=/tmp/image
# losetup /dev/loop0 /tmp/image
# ./mkfs.btrfs /dev/loop0
# mount -t btrfs /dev/loop0 /mnt/btrfs

# dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
# sync

# ./debug-tree /tmp/image

item 5 key (256 11 258) itemoff 3779 itemsize 26
dir index 258 type 1
namelen 4 datalen 0 name: test
[...]
item 1 key (258 1 0) itemoff 2699 itemsize 108
inode generation 0 size 67108864 [...]
[...]
item 3 key (258 12 0) itemoff 2652 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 0 nr 67108864

In the root directory we found a dirent for our test file which shows it
has objectid 258, then we found its inode with size=64m and the file
extent which references the 64m extent on disk which starts at byte
offset 190382080.

So now we over-write a 4k region in the file at offset 64k.

# dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc
# sync

# ./debug-tree /tmp/image

item 1 key (258 1 0) itemoff 2699 itemsize 108
inode generation 0 size 67108864 [...]
[...]
item 3 key (258 12 0) itemoff 2652 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 0 nr 65536
item 4 key (258 12 65536) itemoff 2611 itemsize 41
extent data disk byte 257490944 nr 4096
extent data offset 0 nr 4096
item 5 key (258 12 69632) itemoff 2570 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 69632 nr 67039232

We still have the same inode, and it has the same size, but its extent
items look very different. The extent for the first 64k looks much the
same. It references the old 64m extent on disk. But see the 'nr
65536', it only maps 64k of that 64m into the file. Then we have the 4k
extent that we just wrote. Then we have another reference to that 64m
extent but for the remaining data after the new 4k.

The extra credit assignment is to observe the effect of these extent
reference item changes on the reference count items which are stored
over in the leaves of the extent allocation tree.

debug-tree is fantastic, but it can be kind of intimidating if you don't
already know what all the numbers mean :). Reducing the barrier to
understanding its output might be a great project for someone interested
in learning the disk format without having to learn how to work with the
kernel code.

- z
Chris Mason
2008-04-25 16:58:20 UTC
Permalink
Post by Zach Brown
Post by Chris Mason
We've written into the middle of that 100MB extent, and we need to do
COW. One option is to read the whole thing, change 4k and write it all
back. Instead, btrfs does something like this (+/- off by need more coffee errors):
file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]
An extra reference is taken on the old extent to reflect that we're
pointing to it twice.
If you learn how to parse the debug-tree output then this can be seen
pretty easily. To do this we can watch the leaves of the fs tree for
the inode and extent items of the file we work with:
# dd if=/dev/zero bs=1M count=1k of=/tmp/image
# losetup /dev/loop0 /tmp/image
# ./mkfs.btrfs /dev/loop0
# mount -t btrfs /dev/loop0 /mnt/btrfs
# dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
# sync
# ./debug-tree /tmp/image
Running debug-tree on a live FS is a very good way to learn about trees that
get left around while snapshot deletion is happening and cache aliasing
caused by the way Btrfs puts metadata into its own address space.

But, if you're trying to learn the disk format, I'd stick an unmount between
the dd and the debug-tree ;)

-chris
Zach Brown
2008-04-25 17:04:43 UTC
Permalink
Post by Chris Mason
Running debug-tree on a live FS is a very good way to learn about trees that
get left around while snapshot deletion is happening and cache aliasing
caused by the way Btrfs puts metadata into its own address space.
But, if you're trying to learn the disk format, I'd stick an unmount between
the dd and the debug-tree ;)
Haha, true, true. If you don't know to watch for confusing racy
results, indeed, serialize around each dump :).

- z
Zach Brown
2008-04-25 16:50:42 UTC
Permalink
Post by Chris Mason
We've written into the middle of that 100MB extent, and we need to do COW.
One option is to read the whole thing, change 4k and write it all back.
file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]
An extra reference is taken on the old extent to reflect that we're pointing
to it twice.
If you learn how to parse the debug-tree output then this can be seen
pretty easily. To do this we can watch the leaves of the fs tree for
the inode and extent items of the file we work with:

# dd if=/dev/zero bs=1M count=1k of=/tmp/image
# losetup /dev/loop0 /tmp/image
# ./mkfs.btrfs /dev/loop0
# mount -t btrfs /dev/loop0 /mnt/btrfs

# dd if=/dev/zero bs=64M count=1 of=/mnt/btrfs/test
# sync

# ./debug-tree /tmp/image

item 5 key (256 11 258) itemoff 3779 itemsize 26
dir index 258 type 1
namelen 4 datalen 0 name: test
[...]
item 1 key (258 1 0) itemoff 2699 itemsize 108
inode generation 0 size 67108864 [...]
[...]
item 3 key (258 12 0) itemoff 2652 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 0 nr 67108864

In the root directory we found a dirent for our test file which shows it
has objectid 258, then we found its inode with size=64m and the file
extent which references the 64m extent on disk which starts at byte
offset 190382080.

So now we over-write a 4k region in the file at offset 64k.

# dd if=/dev/zero bs=4k count=1 seek=16 of=/mnt/btrfs/test conv=notrunc
# sync

# ./debug-tree /tmp/image

item 1 key (258 1 0) itemoff 2699 itemsize 108
inode generation 0 size 67108864 [...]
[...]
item 3 key (258 12 0) itemoff 2652 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 0 nr 65536
item 4 key (258 12 65536) itemoff 2611 itemsize 41
extent data disk byte 257490944 nr 4096
extent data offset 0 nr 4096
item 5 key (258 12 69632) itemoff 2570 itemsize 41
extent data disk byte 190382080 nr 67108864
extent data offset 69632 nr 67039232

We still have the same inode, and it has the same size, but its extent
items look very different. The extent for the first 64k looks much the
same. It references the old 64m extent on disk. But see the 'nr
65536', it only maps 64k of that 64m into the file. Then we have the 4k
extent that we just wrote. Then we have another reference to that 64m
extent but for the remaining data after the new 4k.

The extra credit assignment is to observe the effect of these extent
reference item changes on the reference count items which are stored
over in the leaves of the extent allocation tree.

debug-tree is fantastic, but it can be kind of intimidating if you don't
already know what all the numbers mean :). Reducing the barrier to
understanding its output might be a great project for someone interested
in learning the disk format without having to learn how to work with the
kernel code.

- z
Sage Weil
2008-04-25 18:32:49 UTC
Permalink
Post by Zach Brown
We still have the same inode, and it has the same size, but its extent
items look very different. The extent for the first 64k looks much the
same. It references the old 64m extent on disk. But see the 'nr
65536', it only maps 64k of that 64m into the file. Then we have the 4k
extent that we just wrote. Then we have another reference to that 64m
extent but for the remaining data after the new 4k.
Is there anything in the defragger (or whatever) that looks for minimally
referenced extents? One can imagine a situation where only a small piece
of a large extent remains referenced, but that information is buried in
the forward reference(s).
Post by Zach Brown
debug-tree is fantastic, but it can be kind of intimidating if you don't
already know what all the numbers mean :). Reducing the barrier to
Yep. Although in my case, the biggest stumbling block was realizing that
the key type
Post by Zach Brown
item 3 key (258 12 0) itemoff 2652 itemsize 41
^^
is printed in hex for some reason. Der.

sage
Sage Weil
2008-04-25 18:26:20 UTC
Permalink
Post by Chris Mason
Very cool. I'd actually loved to see this wrapped into a program that will
cow a directory tree. Basically the same as cp -al, but with cow instead of
linking.
Yeah definitely. I added a -c/--cow flag to GNU cp, but am having trouble
coercing autotools into even building on my box. I'll fiddle with it a
little later. Basically, it just tries the ioctl, and goes into the
regular copy read/write loop if that fails.
Post by Chris Mason
Post by Sage Weil
However, btrfs_insert_file_extent() always set offset=0. I'm guessing I
need to add an argument there and fix up the other callers?
Yes, you need to preserve the offset. There's only one place right now that
sets a non-zero offset and it inserts the extent by hand for other reasons
(if you're brave, file.c:btrfs_drop_extents)
I see. In this case, since I'm duplicating the forward and backrefs, I
just added the offset arg to btrfs_insert_file_extent().
Post by Chris Mason
By taking the inode mutex, you protect against file_write and truncates
changing the file. But, we also need to prevent mmaps from changing the file
pages as well. What you want to do is lock all the file bytes in the extent
lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS);
But unfortunately, the code to fill delayed allocation takes that same lock.
while(1) {
filemap_write_and_wait(src_inode);
lock_extent()
if (BTRFS_I(src_inode)->delalloc_bytes == 0)
break;
unlock_extent()
}
That should keep you from racing with btrfs_page_mkwrite()
Ah, that's what I was looking for. The adjusted patch is below!

Thanks-
sage



diff -r 1791a620d509 ctree.h
--- a/ctree.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ctree.h Fri Apr 25 10:12:46 2008 -0700
@@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran
/* file-item.c */
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos, u64 offset,
+ u64 objectid, u64 pos, u64 disk_offset,
u64 disk_num_bytes,
- u64 num_bytes);
+ u64 num_bytes, u64 offset);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
diff -r 1791a620d509 file-item.c
--- a/file-item.c Thu Apr 24 13:43:27 2008 -0700
+++ b/file-item.c Fri Apr 25 10:12:46 2008 -0700
@@ -28,10 +28,10 @@
sizeof(struct btrfs_item) * 2) / \
BTRFS_CRC32_SIZE) - 1))
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 offset, u64 disk_num_bytes,
- u64 num_bytes)
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, offset);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
diff -r 1791a620d509 file.c
--- a/file.c Thu Apr 24 13:43:27 2008 -0700
+++ b/file.c Fri Apr 25 10:12:46 2008 -0700
@@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
last_pos_in_file,
- 0, 0, hole_size);
+ 0, 0, hole_size, 0);
btrfs_drop_extent_cache(inode, last_pos_in_file,
last_pos_in_file + hole_size -1);
btrfs_check_file(root, inode);
diff -r 1791a620d509 inode.c
--- a/inode.c Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c Fri Apr 25 10:12:46 2008 -0700
@@ -18,6 +18,7 @@

#include <linux/bio.h>
#include <linux/buffer_head.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
@@ -134,7 +135,7 @@ static int cow_file_range(struct inode *
}
ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
start, ins.objectid, ins.offset,
- ins.offset);
+ ins.offset, 0);
inode->i_blocks += ins.offset >> 9;
btrfs_check_file(root, inode);
num_bytes -= cur_alloc_size;
@@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry *
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
hole_start, 0, 0,
- hole_size);
+ hole_size, 0);
btrfs_drop_extent_cache(inode, hole_start,
hole_size - 1);
btrfs_check_file(root, inode);
@@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file *
return 0;
}

+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf,
+ int slot,
+ struct btrfs_key *key,
+ u64 destino)
+{
+ struct btrfs_path *cpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(leaf, slot);
+ int dstoff;
+ struct btrfs_key ckey = *key;
+ int ret;
+
+ ckey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+ dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+ copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+ btrfs_item_ptr_offset(leaf, slot),
+ len);
+ btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ u64 pos;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int nextret;
+ int slot;
+
+ src_file = fget(src_fd);
+ if (!src_file)
+ return -EBADF;
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ ret = -ENOTEMPTY;
+ if (inode->i_size)
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ filemap_write_and_wait(src->i_mapping);
+ lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+ if (BTRFS_I(src)->delalloc_bytes == 0)
+ break;
+ unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+ }
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ path = btrfs_alloc_path();
+ pos = 0;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+ pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+ next_slot:
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ printk("key(%llu %x %llu)\n",
+ key.objectid, key.type, key.offset);
+ if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+ key.objectid != src->i_ino)
+ goto out;
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int found_type;
+ u64 len;
+ pos = key.offset;
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ u64 off = btrfs_file_extent_offset(leaf,
+ extent);
+ printk(" %llu~%llu disk %llu~%llu off %llu\n",
+ pos, len, ds, dl, off);
+ btrfs_insert_file_extent(trans, root,
+ inode->i_ino, pos,
+ ds, dl, len, off);
+ btrfs_inc_extent_ref(trans, root, ds, dl,
+ root->root_key.objectid,
+ trans->transid,
+ inode->i_ino, pos);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ dup_item_to_inode(trans, root, path, leaf, slot,
+ &key, inode->i_ino);
+ pos = key.offset + len;
+ } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+ dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ inode->i_ino);
+
+ nritems = btrfs_header_nritems(leaf);
+ if (slot >= nritems - 1) {
+ nextret = btrfs_next_leaf(root, path);
+ if (nextret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+ goto next_slot;
+ }
+
+out:
+ ret = 0;
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+ i_size_write(inode, src->i_size);
+ inode->i_blocks = src->i_blocks;
+ mark_inode_dirty(inode);
+
+ unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+out_fput:
+ fput(src_file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ case BTRFS_IOC_CLONE:
+ btrfs_ioctl_clone(file, arg);
return 0;
}

diff -r 1791a620d509 ioctl.h
--- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h Fri Apr 25 10:12:46 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
#endif
Sage Weil
2008-04-26 04:38:40 UTC
Permalink
Hi-

Couple small fixes. Cloning a linux kernel tree (cp -ac) takes 5-6
seconds (cp -al is 3-4 seconds, cp -a is 15-20 seconds).

sage



diff -r 1791a620d509 ctree.h
--- a/ctree.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ctree.h Fri Apr 25 21:14:25 2008 -0700
@@ -1135,9 +1135,9 @@ int btrfs_lookup_inode(struct btrfs_tran
/* file-item.c */
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos, u64 offset,
+ u64 objectid, u64 pos, u64 disk_offset,
u64 disk_num_bytes,
- u64 num_bytes);
+ u64 num_bytes, u64 offset);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
diff -r 1791a620d509 file-item.c
--- a/file-item.c Thu Apr 24 13:43:27 2008 -0700
+++ b/file-item.c Fri Apr 25 21:14:25 2008 -0700
@@ -28,10 +28,10 @@
sizeof(struct btrfs_item) * 2) / \
BTRFS_CRC32_SIZE) - 1))
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 offset, u64 disk_num_bytes,
- u64 num_bytes)
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrf
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, offset);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
diff -r 1791a620d509 file.c
--- a/file.c Thu Apr 24 13:43:27 2008 -0700
+++ b/file.c Fri Apr 25 21:14:25 2008 -0700
@@ -285,7 +285,7 @@ static int noinline dirty_and_release_pa
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
last_pos_in_file,
- 0, 0, hole_size);
+ 0, 0, hole_size, 0);
btrfs_drop_extent_cache(inode, last_pos_in_file,
last_pos_in_file + hole_size -1);
btrfs_check_file(root, inode);
diff -r 1791a620d509 inode.c
--- a/inode.c Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c Fri Apr 25 21:14:25 2008 -0700
@@ -18,6 +18,7 @@

#include <linux/bio.h>
#include <linux/buffer_head.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
@@ -134,7 +135,7 @@ static int cow_file_range(struct inode *
}
ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
start, ins.objectid, ins.offset,
- ins.offset);
+ ins.offset, 0);
inode->i_blocks += ins.offset >> 9;
btrfs_check_file(root, inode);
num_bytes -= cur_alloc_size;
@@ -1046,7 +1047,7 @@ static int btrfs_setattr(struct dentry *
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
hole_start, 0, 0,
- hole_size);
+ hole_size, 0);
btrfs_drop_extent_cache(inode, hole_start,
hole_size - 1);
btrfs_check_file(root, inode);
@@ -2726,6 +2727,168 @@ long btrfs_ioctl_trans_end(struct file *
return 0;
}

+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf,
+ int slot,
+ struct btrfs_key *key,
+ u64 destino)
+{
+ struct btrfs_path *cpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(leaf, slot);
+ int dstoff;
+ struct btrfs_key ckey = *key;
+ int ret;
+
+ ckey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+ dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+ copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+ btrfs_item_ptr_offset(leaf, slot),
+ len);
+ btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ u64 pos;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int nextret;
+ int slot;
+
+ src_file = fget(src_fd);
+ if (!src_file)
+ return -EBADF;
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ ret = -ENOTEMPTY;
+ if (inode->i_size)
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ filemap_write_and_wait(src->i_mapping);
+ lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+ if (BTRFS_I(src)->delalloc_bytes == 0)
+ break;
+ unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+ }
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ path = btrfs_alloc_path();
+ pos = 0;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+ pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+ next_slot:
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ nritems = btrfs_header_nritems(leaf);
+
+ if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+ key.objectid != src->i_ino)
+ goto out;
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int found_type;
+ pos = key.offset;
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ u64 len = btrfs_file_extent_num_bytes(leaf,
+ extent);
+ u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ u64 off = btrfs_file_extent_offset(leaf,
+ extent);
+ btrfs_insert_file_extent(trans, root,
+ inode->i_ino, pos,
+ ds, dl, len, off);
+ btrfs_inc_extent_ref(trans, root, ds, dl,
+ root->root_key.objectid,
+ trans->transid,
+ inode->i_ino, pos);
+ pos = key.offset + len;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ dup_item_to_inode(trans, root, path, leaf, slot,
+ &key, inode->i_ino);
+ pos = key.offset + btrfs_item_size_nr(leaf,
+ slot);
+ }
+ } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+ dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ inode->i_ino);
+
+ if (slot >= nritems - 1) {
+ nextret = btrfs_next_leaf(root, path);
+ if (nextret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+ goto next_slot;
+ }
+
+out:
+ btrfs_free_path(path);
+ ret = 0;
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+ i_size_write(inode, src->i_size);
+ inode->i_blocks = src->i_blocks;
+ mark_inode_dirty(inode);
+
+ unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+out_fput:
+ fput(src_file);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2744,6 +2907,9 @@ long btrfs_ioctl(struct file *file, unsi
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ case BTRFS_IOC_CLONE:
+ btrfs_ioctl_clone(file, arg);
return 0;
}

diff -r 1791a620d509 ioctl.h
--- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h Fri Apr 25 21:14:25 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
#endif
Yan Zheng
2008-05-03 04:44:08 UTC
Permalink
Hello Sage,

I think the clone ioctl won't work in some corner cases. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
the next item in the tree. However, this approach works only when the
layout of the tree remains unchanged. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of the tree.

To be safe, I think the code should:
use btrfs_search_slot to find the next item.
use an intermediate buffer when copying an item between two extent buffers.

Regards
YZ
Sage Weil
2008-05-03 06:16:20 UTC
Permalink
Hi Yan-
Post by Yan Zheng
I think the clone ioctl won't work in some corner cases. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
the next item in the tree. However, this approach works only when the
layout of the tree remains unchanged. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of the tree.
use btrfs_search_slot to find the next item.
use an intermediate buffer when copying an item between two extent buffers.
Oh, right. I think for the item copy, though, we just need to re-search
for the source key again after doing the insert_empty_item. Then we can
still use copy_extent_buffer, since at that point both paths will be
valid?

Something like the below (untested) patch. I suspect I didn't hit this
because I was always cloning 'file' to something like 'file2' that always
sorted after the src file, and didn't shift its position in the leaf. I'll
try to test this in the next few days.

Thanks-
sage


diff -r f6ba18a50ad7 inode.c
--- a/inode.c Fri May 02 16:13:49 2008 -0400
+++ b/inode.c Fri May 02 23:11:49 2008 -0700
@@ -3103,25 +3103,27 @@ out:

void dup_item_to_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf,
- int slot,
- struct btrfs_key *key,
+ struct btrfs_path *srcpath,
+ struct btrfs_key *srckey,
u64 destino)
{
- struct btrfs_path *cpath = btrfs_alloc_path();
- int len = btrfs_item_size_nr(leaf, slot);
- int dstoff;
- struct btrfs_key ckey = *key;
+ struct btrfs_key dstkey = *srckey;
+ struct btrfs_path *dstpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(srcpath->nodes[0],
+ srcpath->slots[0]);
int ret;

- ckey.objectid = destino;
- ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
- dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
- copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
- btrfs_item_ptr_offset(leaf, slot),
+ dstkey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
+ /* re-search for src key, in case we changed srcpath */
+ ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
+ copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0],
+ btrfs_item_ptr_offset(dstpath->nodes[0],
+ dstpath->slots[0]),
+ btrfs_item_ptr_offset(srcpath->nodes[0],
+ srcpath->slots[0]),
len);
- btrfs_release_path(root, cpath);
+ btrfs_release_path(root, dstpath);
}

long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
@@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
struct btrfs_key key;
struct extent_buffer *leaf;
u32 nritems;
- int nextret;
int slot;

src_file = fget(src_fd);
@@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
pos, 0);
+
+next_slot:
if (ret < 0)
goto out;
if (ret > 0) {
@@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
}
path->slots[0]--;
}
-next_slot:
+
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -3225,22 +3228,19 @@ next_slot:
}
pos = key.offset + len;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- dup_item_to_inode(trans, root, path, leaf, slot,
- &key, inode->i_ino);
+ dup_item_to_inode(trans, root, path, &key,
+ inode->i_ino);
pos = key.offset + btrfs_item_size_nr(leaf,
slot);
}
} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
- dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ dup_item_to_inode(trans, root, path, &key,
inode->i_ino);

- if (slot >= nritems - 1) {
- nextret = btrfs_next_leaf(root, path);
- if (nextret)
- goto out;
- } else {
- path->slots[0]++;
- }
+ /* path may not still be valid, so explicitly search
+ * for the next key */
+ key.offset = pos;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
goto next_slot;
}
Yan Zheng
2008-05-03 06:48:02 UTC
Permalink
Post by Sage Weil
Hi Yan-
Post by Yan Zheng
I think the clone ioctl won't work in some corner cases. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
the next item in the tree. However, this approach works only when the
layout of the tree remains unchanged. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of the tree.
use btrfs_search_slot to find the next item.
use an intermediate buffer when copying an item between two extent buffers.
Oh, right. I think for the item copy, though, we just need to re-search
for the source key again after doing the insert_empty_item. Then we can
still use copy_extent_buffer, since at that point both paths will be
valid?
Something like the below (untested) patch. I suspect I didn't hit this
because I was always cloning 'file' to something like 'file2' that always
sorted after the src file, and didn't shift its position in the leaf. I'll
try to test this in the next few days.
Thanks-
sage
diff -r f6ba18a50ad7 inode.c
--- a/inode.c Fri May 02 16:13:49 2008 -0400
+++ b/inode.c Fri May 02 23:11:49 2008 -0700
void dup_item_to_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf,
- int slot,
- struct btrfs_key *key,
+ struct btrfs_path *srcpath,
+ struct btrfs_key *srckey,
u64 destino)
{
- struct btrfs_path *cpath = btrfs_alloc_path();
- int len = btrfs_item_size_nr(leaf, slot);
- int dstoff;
- struct btrfs_key ckey = *key;
+ struct btrfs_key dstkey = *srckey;
+ struct btrfs_path *dstpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(srcpath->nodes[0],
+ srcpath->slots[0]);
int ret;
- ckey.objectid = destino;
- ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
- dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
- copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
- btrfs_item_ptr_offset(leaf, slot),
+ dstkey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
+ /* re-search for src key, in case we changed srcpath */
+ ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
+ copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0],
+ btrfs_item_ptr_offset(dstpath->nodes[0],
+ dstpath->slots[0]),
+ btrfs_item_ptr_offset(srcpath->nodes[0],
+ srcpath->slots[0]),
len);
- btrfs_release_path(root, cpath);
+ btrfs_release_path(root, dstpath);
}
long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
@@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
struct btrfs_key key;
struct extent_buffer *leaf;
u32 nritems;
- int nextret;
int slot;
src_file = fget(src_fd);
@@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
pos, 0);
+
if (ret < 0)
goto out;
if (ret > 0) {
@@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
}
path->slots[0]--;
}
+
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
}
pos = key.offset + len;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- dup_item_to_inode(trans, root, path, leaf, slot,
- &key, inode->i_ino);
+ dup_item_to_inode(trans, root, path, &key,
+ inode->i_ino);
pos = key.offset + btrfs_item_size_nr(leaf,
slot);
}
} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
- dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ dup_item_to_inode(trans, root, path, &key,
inode->i_ino);
- if (slot >= nritems - 1) {
- nextret = btrfs_next_leaf(root, path);
- if (nextret)
- goto out;
- } else {
- path->slots[0]++;
- }
+ /* path may not still be valid, so explicitly search
+ * for the next key */
+ key.offset = pos;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
goto next_slot;
}
key.offset isn't updated properly. You should differentiate between
extent items, inline extent items and csum items.

For extent item:
key.offset += btrfs_file_extent_num_bytes(...);

For inline extent item:
u32 len = btrfs_file_extent_inline_len(...);
key.offset += ALIGN(len, root->sectorsize);

For csum item:
key.offset += btrfs_item_size_nr(...) / BTRFS_CRC32_SIZE * root->sectorsize;

Regards
YZ
Yan Zheng
2008-05-03 07:25:00 UTC
Permalink
Post by Sage Weil
Hi Yan-
Post by Yan Zheng
I think the clone ioctl won't work in some corner cases. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
the next item in the tree. However, this approach works only when the
layout of the tree remains unchanged. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of the tree.
use btrfs_search_slot to find the next item.
use an intermediate buffer when copying an item between two extent buffers.
Oh, right. I think for the item copy, though, we just need to re-search
for the source key again after doing the insert_empty_item. Then we can
still use copy_extent_buffer, since at that point both paths will be
valid?
Something like the below (untested) patch. I suspect I didn't hit this
because I was always cloning 'file' to something like 'file2' that always
sorted after the src file, and didn't shift its position in the leaf. I'll
try to test this in the next few days.
Thanks-
sage
diff -r f6ba18a50ad7 inode.c
--- a/inode.c Fri May 02 16:13:49 2008 -0400
+++ b/inode.c Fri May 02 23:11:49 2008 -0700
void dup_item_to_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf,
- int slot,
- struct btrfs_key *key,
+ struct btrfs_path *srcpath,
+ struct btrfs_key *srckey,
u64 destino)
{
- struct btrfs_path *cpath = btrfs_alloc_path();
- int len = btrfs_item_size_nr(leaf, slot);
- int dstoff;
- struct btrfs_key ckey = *key;
+ struct btrfs_key dstkey = *srckey;
+ struct btrfs_path *dstpath = btrfs_alloc_path();
+ int len = btrfs_item_size_nr(srcpath->nodes[0],
+ srcpath->slots[0]);
int ret;
- ckey.objectid = destino;
- ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
- dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
- copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
- btrfs_item_ptr_offset(leaf, slot),
+ dstkey.objectid = destino;
+ ret = btrfs_insert_empty_item(trans, root, dstpath, &dstkey, len);
+ /* re-search for src key, in case we changed srcpath */
+ ret = btrfs_search_slot(trans, root, srckey, srcpath, 0, 0);
+ copy_extent_buffer(dstpath->nodes[0], srcpath->nodes[0],
+ btrfs_item_ptr_offset(dstpath->nodes[0],
+ dstpath->slots[0]),
+ btrfs_item_ptr_offset(srcpath->nodes[0],
+ srcpath->slots[0]),
len);
- btrfs_release_path(root, cpath);
+ btrfs_release_path(root, dstpath);
}
long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
@@ -3137,7 +3139,6 @@ long btrfs_ioctl_clone(struct file *file
struct btrfs_key key;
struct extent_buffer *leaf;
u32 nritems;
- int nextret;
int slot;
src_file = fget(src_fd);
@@ -3178,6 +3179,8 @@ long btrfs_ioctl_clone(struct file *file
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
pos, 0);
+
if (ret < 0)
goto out;
if (ret > 0) {
@@ -3187,7 +3190,7 @@ long btrfs_ioctl_clone(struct file *file
}
path->slots[0]--;
}
+
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
}
pos = key.offset + len;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- dup_item_to_inode(trans, root, path, leaf, slot,
- &key, inode->i_ino);
+ dup_item_to_inode(trans, root, path, &key,
+ inode->i_ino);
pos = key.offset + btrfs_item_size_nr(leaf,
slot);
}
} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
- dup_item_to_inode(trans, root, path, leaf, slot, &key,
+ dup_item_to_inode(trans, root, path, &key,
inode->i_ino);
- if (slot >= nritems - 1) {
- nextret = btrfs_next_leaf(root, path);
- if (nextret)
- goto out;
- } else {
- path->slots[0]++;
- }
+ /* path may not still be valid, so explicitly search
+ * for the next key */
+ key.offset = pos;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
goto next_slot;
}
In my previous mail, I said items of different types should be
differentiated. Actually, there is no need to do that. Please consider
changing the big loop in btrfs_ioctl_clone to something like:
---
key.objectid = src->i_ino;
key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
while (1) {
ret = btrfs_search_slot(trans, root, &key, &path, 0, 0);
if (ret < 0)
goto fail;

leaf = path.nodes[0];
if (path.slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(extent_root, &path);
if (ret < 0)
goto fail;
if (ret > 0)
break;
leaf = path.nodes[0];
}

btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);

...
do some work
...

btrfs_release_path(root, path);
key.offset++;
}

---
Regards
YZ
Chris Mason
2008-05-05 10:27:46 UTC
Permalink
Post by Yan Zheng
Post by Sage Weil
Hi Yan-
Post by Yan Zheng
I think the clone ioctl won't work in some corner cases. The big loop
in btrfs_ioctl_clone uses path->slots[0]++ and btrfs_next_leaf to get
the next item in the tree. However, this approach works only when the
layout of the tree remains unchanged. In btrfs_ioctl_clone, both
btrfs_insert_file_extent and dup_item_to_inode may change the layout
of the tree.
use btrfs_search_slot to find the next item.
use an intermediate buffer when copying an item between two extent buffers.
[ ... ]
Post by Yan Zheng
In my previous mail, I said items of different types should be
differentiated. Actually, there is no need to do that. Please consider
Oh, nice catch Yan, thanks. I've pushed out a new version to the unstable
tree. Sage, could you please give this a try too?

-chris
Sage Weil
2008-05-05 15:57:28 UTC
Permalink
Post by Chris Mason
Post by Yan Zheng
In my previous mail, I said items of different types should be
differentiated. Actually, there is no need to do that. Please consider
Oh, nice catch Yan, thanks. I've pushed out a new version to the unstable
tree. Sage, could you please give this a try too?
Looks good to me. Here's a small cleanup to remove the now unnecessary
pos.

sage


diff -r d94a17e354a8 inode.c
--- a/inode.c Mon May 05 06:26:21 2008 -0400
+++ b/inode.c Mon May 05 09:02:55 2008 -0700
@@ -3135,7 +3135,6 @@ long btrfs_ioctl_clone(struct file *file
struct inode *src;
struct btrfs_trans_handle *trans;
int ret;
- u64 pos;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -3183,7 +3182,6 @@ long btrfs_ioctl_clone(struct file *file
key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
key.objectid = src->i_ino;
- pos = 0;
path->reada = 2;

while (1) {
@@ -3214,7 +3212,6 @@ long btrfs_ioctl_clone(struct file *file
if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
struct btrfs_file_extent_item *extent;
int found_type;
- pos = key.offset;
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, extent);
@@ -3228,25 +3225,22 @@ long btrfs_ioctl_clone(struct file *file
u64 off = btrfs_file_extent_offset(leaf,
extent);
btrfs_insert_file_extent(trans, root,
- inode->i_ino, pos,
+ inode->i_ino,
+ key.offset,
ds, dl, len, off);
/* ds == 0 means there's a hole */
- if (ds != 0) {
+ if (ds != 0)
btrfs_inc_extent_ref(trans, root,
ds, dl,
root->root_key.objectid,
trans->transid,
- inode->i_ino, pos);
- }
- pos = key.offset + len;
+ inode->i_ino, key.offset);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
ret = dup_item_to_inode(trans, root, path,
leaf, slot, &key,
inode->i_ino);
if (ret)
goto out;
- pos = key.offset + btrfs_item_size_nr(leaf,
- slot);
}
} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
ret = dup_item_to_inode(trans, root, path, leaf,

Sage Weil
2008-04-25 20:28:53 UTC
Permalink
Post by Chris Mason
Very cool. I'd actually love to see this wrapped into a program that will
cow a directory tree. Basically the same as cp -al, but with cow instead of
linking.
Here's a pretty trivial patch against cp in coreutils-6.10. It'll
probably take you more time to build than to make the changes yourself,
but this seems much handier and more robust than a separate tool.

$ cp -ac /mnt/btrfs/from /mnt/btrfs/to
...

sage


diff -ur coreutils-6.10/src/copy.c coreutils-6.10-btrfs/src/copy.c
--- coreutils-6.10/src/copy.c 2008-01-05 14:59:11.000000000 -0800
+++ coreutils-6.10-btrfs/src/copy.c 2008-04-25 13:12:20.000000000 -0700
@@ -537,6 +537,9 @@
buf_alloc = xmalloc (buf_size + buf_alignment_slop);
buf = ptr_align (buf_alloc, buf_alignment);

+#define BTRFS_IOC_CLONE 0x40049409
+ if (!x->cow ||
+ ioctl(dest_desc, BTRFS_IOC_CLONE, source_desc) != 0)
for (;;)
{
word *wp = NULL;
diff -ur coreutils-6.10/src/copy.h coreutils-6.10-btrfs/src/copy.h
--- coreutils-6.10/src/copy.h 2008-01-05 14:58:25.000000000 -0800
+++ coreutils-6.10-btrfs/src/copy.h 2008-04-25 12:46:30.000000000 -0700
@@ -135,6 +135,8 @@
on different file systems from the one we started on. */
bool one_file_system;

+ bool cow;
+
/* If true, attempt to give the copies the original files' permissions,
owner, group, and timestamps. */
bool preserve_ownership;
diff -ur coreutils-6.10/src/cp.c coreutils-6.10-btrfs/src/cp.c
--- coreutils-6.10/src/cp.c 2008-01-11 03:19:53.000000000 -0800
+++ coreutils-6.10-btrfs/src/cp.c 2008-04-25 13:23:12.000000000 -0700
@@ -125,6 +125,7 @@
{"archive", no_argument, NULL, 'a'},
{"backup", optional_argument, NULL, 'b'},
{"copy-contents", no_argument, NULL, COPY_CONTENTS_OPTION},
+ {"cow", no_argument, NULL, 'c'},
{"dereference", no_argument, NULL, 'L'},
{"force", no_argument, NULL, 'f'},
{"interactive", no_argument, NULL, 'i'},
@@ -178,6 +179,7 @@
--backup[=CONTROL] make a backup of each existing destination file\n\
-b like --backup but does not accept an argument\n\
--copy-contents copy contents of special files when recursive\n\
+ -c, --cow attempt issuing copy-on-write ioctl to fs\n\
-d same as --no-dereference --preserve=links\n\
"), stdout);
fputs (_("\
@@ -767,6 +769,7 @@
x->interactive = I_UNSPECIFIED;
x->move_mode = false;
x->one_file_system = false;
+ x->cow = false;

x->preserve_ownership = false;
x->preserve_links = false;
@@ -909,7 +912,7 @@
we'll actually use backup_suffix_string. */
backup_suffix_string = getenv ("SIMPLE_BACKUP_SUFFIX");

- while ((c = getopt_long (argc, argv, "abdfHilLprst:uvxPRS:T",
+ while ((c = getopt_long (argc, argv, "abcdfHilLprst:uvxPRS:T",
long_opts, NULL))
!= -1)
{
@@ -940,6 +943,10 @@
copy_contents = true;
break;

+ case 'c':
+ x.cow = true;
+ break;
+
case 'd':
x.preserve_links = true;
x.dereference = DEREF_NEVER;
Chris Mason
2008-04-29 20:52:54 UTC
Permalink
Post by Sage Weil
Hi-
I'm working on a clone ioctl that will quickly and efficiently duplicate
the contents of a file, e.g.
Just FYI, I didn't sneak this into v0.14 because I didn't quite have the
cycles to test it. It'll go in this week.

-chris
Loading...