Discussion:
[Btrfs-devel] transaction ioctls
Sage Weil
2008-04-22 20:12:35 UTC
Permalink
Hi Chris,

These ioctls let a user application hold a transaction open while it
performs a series of operations. A final ioctl does a sync on the fs
(closing the current transaction). This is the main requirement for
Ceph's OSD to be able to keep the data it's storing in a btrfs volume
consistent, and AFAICS it works just fine. The application would do
something like

fd = ::open("some/file", O_RDONLY);
::ioctl(fd, BTRFS_IOC_TRANS_START);
/* do a bunch of stuff */
::ioctl(fd, BTRFS_IOC_TRANS_END);
or just
::close(fd);

And to ensure it commits to disk,

::ioctl(fd, BTRFS_IOC_SYNC);

When a transaction is held open, the trans_handle is attached to the
struct file (via private_data) so that it will get cleaned up if the
process dies unexpectedly. A held transaction is also ended on fsync() to
avoid a deadlock. There may be other places I missed?

A misbehaving application could also deliberately hold a transaction open,
effectively locking up the FS, so it may make sense to restrict something
like this to root or something.

Anyway, does this look reasonable?

Thanks-
sage



diff -r e4cd88595ed7 ctree.h
--- a/ctree.h Thu Feb 21 14:54:12 2008 -0500
+++ b/ctree.h Tue Apr 22 12:08:53 2008 -0700
@@ -1179,6 +1179,7 @@ void btrfs_destroy_inode(struct inode *i
void btrfs_destroy_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
struct btrfs_root *root);
@@ -1199,6 +1200,8 @@ int btrfs_drop_extents(struct btrfs_tran
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_file_release(struct inode *inode, struct file *file);
+
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int cache_only);
@@ -1217,4 +1220,5 @@ int btrfs_delete_xattrs(struct btrfs_tra
struct btrfs_root *root, struct inode *inode);
/* super.c */
u64 btrfs_parse_size(char *str);
+int btrfs_sync_fs(struct super_block *sb, int wait);
#endif
diff -r e4cd88595ed7 file.c
--- a/file.c Thu Feb 21 14:54:12 2008 -0500
+++ b/file.c Tue Apr 22 12:08:53 2008 -0700
@@ -941,6 +941,8 @@ static int btrfs_sync_file(struct file *
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
+ if (file->private_data)
+ btrfs_ioctl_trans_end(file);
trans = btrfs_start_transaction(root, 1);
if (!trans) {
ret = -ENOMEM;
@@ -950,6 +952,13 @@ out:
out:
mutex_unlock(&root->fs_info->fs_mutex);
return ret > 0 ? EIO : ret;
+}
+
+int btrfs_file_release(struct inode *inode, struct file *file)
+{
+ if (file->private_data)
+ btrfs_ioctl_trans_end(file);
+ return 0;
}

static struct vm_operations_struct btrfs_file_vm_ops = {
@@ -980,6 +989,7 @@ struct file_operations btrfs_file_operat
.write = btrfs_file_write,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
+ .release = btrfs_file_release,
.fsync = btrfs_sync_file,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
diff -r e4cd88595ed7 inode.c
--- a/inode.c Thu Feb 21 14:54:12 2008 -0500
+++ b/inode.c Tue Apr 22 12:08:53 2008 -0700
@@ -2690,6 +2690,42 @@ static int btrfs_ioctl_defrag(struct fil
return 0;
}

+long btrfs_ioctl_trans_start(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (file->private_data)
+ return -EINPROGRESS;
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 0);
+ if (trans)
+ file->private_data = trans;
+ else
+ ret = -ENOMEM;
+ /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+ mutex_unlock(&root->fs_info->fs_mutex);
+ return ret;
+}
+
+long btrfs_ioctl_trans_end(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans = file->private_data;
+
+ if (!trans)
+ return -EINVAL;
+ /*printk(KERN_INFO "btrfs_ioctl_trans_end on %p\n", file);*/
+ mutex_lock(&root->fs_info->fs_mutex);
+ btrfs_end_transaction(trans, root);
+ file->private_data = 0;
+ mutex_unlock(&root->fs_info->fs_mutex);
+ return 0;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2702,6 +2738,13 @@ long btrfs_ioctl(struct file *file, unsi
return btrfs_ioctl_defrag(file);
case BTRFS_IOC_RESIZE:
return btrfs_ioctl_resize(root, (void __user *)arg);
+ case BTRFS_IOC_TRANS_START:
+ return btrfs_ioctl_trans_start(file);
+ case BTRFS_IOC_TRANS_END:
+ return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_SYNC:
+ btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
}

return -ENOTTY;
@@ -3019,6 +3062,7 @@ static struct file_operations btrfs_dir_
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .release = btrfs_file_release,
};

static struct extent_io_ops btrfs_extent_io_ops = {
diff -r e4cd88595ed7 ioctl.h
--- a/ioctl.h Thu Feb 21 14:54:12 2008 -0500
+++ b/ioctl.h Tue Apr 22 12:08:53 2008 -0700
@@ -32,4 +32,9 @@ struct btrfs_ioctl_vol_args {
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
struct btrfs_ioctl_vol_args)
+
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
#endif
+
diff -r e4cd88595ed7 super.c
--- a/super.c Thu Feb 21 14:54:12 2008 -0500
+++ b/super.c Tue Apr 22 12:08:53 2008 -0700
@@ -284,7 +284,7 @@ fail_close:
return err;
}

-static int btrfs_sync_fs(struct super_block *sb, int wait)
+int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
Zach Brown
2008-04-22 20:29:25 UTC
Permalink
Post by Sage Weil
A misbehaving application could also deliberately hold a transaction open,
effectively locking up the FS, so it may make sense to restrict something
like this to root or something.
I suspect it doesn't have to be deliberate.

Have you tried this under memory pressure? I wonder if the application
can get stuck waiting for memory which will only be freed once the
transaction closes.

I'm reasonably sure that we've discussed this persistent theoretical
problem with these kinds of interfaces ;).

- z
Chris Mason
2008-04-22 20:41:34 UTC
Permalink
Post by Zach Brown
Post by Sage Weil
A misbehaving application could also deliberately hold a transaction
open, effectively locking up the FS, so it may make sense to restrict
something like this to root or something.
I suspect it doesn't have to be deliberate.
Have you tried this under memory pressure? I wonder if the application
can get stuck waiting for memory which will only be freed once the
transaction closes.
This isn't as big an issue, btrfs doesn't pin pages while the transaction is
running. There are some accounting rbtrees that grow while the transaction is
running, but it isn't like a reiserfsv3 or jbd that have physical blocks on
disk pinned.
Post by Zach Brown
I'm reasonably sure that we've discussed this persistent theoretical
problem with these kinds of interfaces ;).
I do agree it isn't practical for anything other than a tightly controlled
interface. It might make sense to create specific ioctls or syscalls for the
operations you need to combine. Perhaps a generic mechanism that can link a
bunch of async syscalls together within a single framework.

Ok, really though, I seem to remember that ceph needed to do file + xattr
operations in one atomic shot, were there others?

-chris
Sage Weil
2008-04-22 20:52:53 UTC
Permalink
Post by Chris Mason
Ok, really though, I seem to remember that ceph needed to do file + xattr
operations in one atomic shot, were there others?
The transactions generally look like

write(a)
setxattr(a)
write(b)
setxattr(b)

It _could_ be broken down into write intent, do X, log X, such that the
atomicity isn't strictly necessary, but it'd be so much nicer to just wrap
things up into tidy transactions.

sage
Chris Mason
2008-04-22 20:55:37 UTC
Permalink
Post by Sage Weil
Post by Chris Mason
Ok, really though, I seem to remember that ceph needed to do file + xattr
operations in one atomic shot, were there others?
The transactions generally look like
write(a)
setxattr(a)
write(b)
setxattr(b)
Hmm, is this whole thing the atomic unit, or can a and b be done separately?

-chris
Sage Weil
2008-04-22 20:56:49 UTC
Permalink
Post by Chris Mason
Post by Sage Weil
Post by Chris Mason
Ok, really though, I seem to remember that ceph needed to do file + xattr
operations in one atomic shot, were there others?
The transactions generally look like
write(a)
setxattr(a)
write(b)
setxattr(b)
Hmm, is this whole thing the atomic unit, or can a and b be done separately?
The whole thing.

sage
Evgeniy Polyakov
2008-04-22 21:05:15 UTC
Permalink
Hi.
Post by Chris Mason
Post by Sage Weil
The transactions generally look like
write(a)
setxattr(a)
write(b)
setxattr(b)
Hmm, is this whole thing the atomic unit, or can a and b be done separately?
No, main idea is to bind very different operations together and make
them look atomic from userspace point of view. But transactions are
nothing without ability to correctly unroll them on demand.
Transaction can include any operation with data and metadata.
--
Evgeniy Polyakov
Chris Mason
2008-04-23 12:50:54 UTC
Permalink
Hi.
On Tue, Apr 22, 2008 at 04:55:37PM -0400, Chris Mason
Post by Chris Mason
Post by Sage Weil
The transactions generally look like
write(a)
setxattr(a)
write(b)
setxattr(b)
Hmm, is this whole thing the atomic unit, or can a and b be done separately?
No, main idea is to bind very different operations together and make
them look atomic from userspace point of view. But transactions are
nothing without ability to correctly unroll them on demand.
Transaction can include any operation with data and metadata.
Transaction rollback from a filesystem point of view is a reboot. Real
database style transactions with rollback and isolation from other procs etc
etc are outside the scope of Btrfs.

-chris
Evgeniy Polyakov
2008-04-23 12:57:54 UTC
Permalink
Hi Chris.
Post by Chris Mason
Transaction rollback from a filesystem point of view is a reboot. Real
database style transactions with rollback and isolation from other procs etc
etc are outside the scope of Btrfs.
Why rollback is a reboot? With copy-on-write it could be possible to
just commit tree state, which was before transaction start, as a current
one and thus rollback all changes. Having that possibility from
userspace could be a great benefit, since in case of application error
it is really simple to undo all changes.
--
Evgeniy Polyakov
Chris Mason
2008-04-23 13:07:28 UTC
Permalink
Post by Evgeniy Polyakov
Hi Chris.
On Wed, Apr 23, 2008 at 08:50:54AM -0400, Chris Mason
Post by Chris Mason
Transaction rollback from a filesystem point of view is a reboot. Real
database style transactions with rollback and isolation from other procs
etc etc are outside the scope of Btrfs.
Why rollback is a reboot? With copy-on-write it could be possible to
just commit tree state, which was before transaction start, as a current
one and thus rollback all changes. Having that possibility from
userspace could be a great benefit, since in case of application error
it is relly simple to undo all changes.
Oh, from a filesystem point of view it is very simple to undo changes,
especially with COW. We've got snapshots and we can pull old copies from an
old snapshot etc etc.

But, userland expects things not to be undone. Picture two procs operating in
a directory. One proc calls fsync and gets assurance from the FS that things
are on disk. The other proc calls rollback and undoes the fsync. The posix
API isn't built around this.

There are definitely cases where the admin will want to be able to run a
command to shift the FS state back to some time in the past. But, it needs
to be an admin level tool where the complex interactions between procs are
well understood (by the admin).

-chris
Evgeniy Polyakov
2008-04-23 13:15:08 UTC
Permalink
Post by Chris Mason
But, userland expects things not to be undone. Picture two procs operating in
a directory. One proc calls fsync and gets assurance from the FS that things
are on disk. The other proc calls rollback and undoes the fsync. The posix
API isn't built around this.
Rollback happens on transaction, so first application called fsync in
own transaction, which flushed data to disk, while second thread has own
transaction, and that data will be removed, while data written in first
transaction is still on disk.
Post by Chris Mason
There are definitely cases where the admin will want to be able to run a
command to shift the FS state back to some time in the past. But, it needs
to be an admin level tool where the complex interactions between procs are
well understood (by the admin).
Well, to allow or not to allow transaction mechanism to users is the
last question imho, from security point of view it can be limited to
admin only, although if transaction is only a label to operation, then
it can be allowed to be done for users too...
--
Evgeniy Polyakov
Chris Mason
2008-04-23 13:23:03 UTC
Permalink
On Wed, Apr 23, 2008 at 09:07:28AM -0400, Chris Mason
Post by Chris Mason
But, userland expects things not to be undone. Picture two procs
operating in a directory. One proc calls fsync and gets assurance from
the FS that things are on disk. The other proc calls rollback and undoes
the fsync. The posix API isn't built around this.
Rollback happens on transaction, so first application called fsync in
own trasaction, which flushed data to disk, while second thread has own
trasaction, and that data will be removed, while data written in first
transaction is still on disk.
The kind of logging this requires is outside the scope of Btrfs ;) It is
possible if both procs are running in different tree roots, but how about:

proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback

Filesystems can be databases, but not with the current APIs. Userland simply
isn't built around these semantics today.

-chris
Evgeniy Polyakov
2008-04-23 16:21:32 UTC
Permalink
Post by Chris Mason
The kind of logging this requires is outside the scope of Btrfs ;) It is
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback
Depending on where transaction was started and where it was stopped. If
there are exactly two transactions started and stopped at the start and
the end of the 'trace', then rollback of transaction A means rollback of
the inner transactions too.
Post by Chris Mason
Filesystems can be databases, but not with the current APIs. Userland simply
isn't built around these semantics today.
This is a philosophical dispute, I always believed that there is no
difference between database and filesystem, but only access method
changes. And having proper API is just a matter of taste: one can create
ioctl based one to be private feature of the new filesystem. No one
argues that XFS operation system should be transformed into XFS
filesystem and generic VFS helpers (although that could be a good idea).
--
Evgeniy Polyakov
Chris Mason
2008-04-23 17:15:45 UTC
Permalink
On Wed, Apr 23, 2008 at 09:23:03AM -0400, Chris Mason
Post by Chris Mason
The kind of logging this requires is outside the scope of Btrfs ;) It is
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback
Depending on where transaction was started and where it was stopped. If
there are exactly two transactions started and stopped at the start and
the end of the 'trace', then rollback of transaction A means rollback of
the inner transactions too.
Right, these are things that real databases solve that posix doesn't expect or
understand. The rollback will make the file disappear, but some other
process could have the file open, without a transaction running. So, the
rollback needs to provide the same semantics you get today with unlink on an
open file.

Definitely not impossible, but really outside the scope of btrfs.
Post by Chris Mason
Filesystems can be databases, but not with the current APIs. Userland
simply isn't built around these semantics today.
This is a philosiphical disput, I always believed that there is no
difference between database and filesystem, but only access method
changes.
We agree there, except that filesystems are able to include a number of
optimizations that databases can't because they don't have to do rollback or
private views of the data.
And having proper API is just a matter of taste: one can create
ioctl based one to be private feature of the new filesystem. No one
argues that XFS operation system should be transformed into XFS
filesystem and generic VFS helpers (although that could be a good idea).
It isn't just the API, it is the rules surrounding how and when files or
directories can disappear. Please, prove me wrong...patches always welcome.

-chris
Bron Gondwana
2008-04-23 23:52:08 UTC
Permalink
Post by Chris Mason
On Wed, Apr 23, 2008 at 09:07:28AM -0400, Chris Mason
Post by Chris Mason
But, userland expects things not to be undone. Picture two procs
operating in a directory. One proc calls fsync and gets assurance from
the FS that things are on disk. The other proc calls rollback and undoes
the fsync. The posix API isn't built around this.
Rollback happens on transaction, so first application called fsync in
own trasaction, which flushed data to disk, while second thread has own
trasaction, and that data will be removed, while data written in first
transaction is still on disk.
The kind of logging this requires is outside the scope of Btrfs ;) It is
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback
Filesystems can be databases, but not with the current APIs. Userland simply
isn't built around these semantics today.
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: unlink dir1/file1
proc A: rmdir dir1

I don't see the difference.

Bron.
Chris Mason
2008-04-24 13:06:54 UTC
Permalink
Post by Chris Mason
Post by Chris Mason
On Wed, Apr 23, 2008 at 09:07:28AM -0400, Chris Mason
Post by Chris Mason
But, userland expects things not to be undone. Picture two procs
operating in a directory. One proc calls fsync and gets assurance
from the FS that things are on disk. The other proc calls rollback
and undoes the fsync. The posix API isn't built around this.
Rollback happens on transaction, so first application called fsync in
own trasaction, which flushed data to disk, while second thread has own
trasaction, and that data will be removed, while data written in first
transaction is still on disk.
The kind of logging this requires is outside the scope of Btrfs ;) It is
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback
Filesystems can be databases, but not with the current APIs. Userland
simply isn't built around these semantics today.
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: unlink dir1/file1
proc A: rmdir dir1
I don't see the difference.
The main difference is that in the unlink case, the unlink goes through a
series of code in the VFS to make sure that open file handles stay viable and
that all of the other posix rules are followed. In the rollback case, the
filesystem has to do all of that on its own.

Here's another:

proc A: mkdir dir1
proc B: open dir1/file1 O_CREAT
proc A: rollback
proc B: close

Doing the same thing with rmdir would fail because the directory wasn't empty.
In order to provide the rollback, the FS would have to wander through all of
the dentries and do something sane with them. It could rename the directory
to dir1.soontobedead and clean it as soon as proc B was done.

The main point is this kind of thing is littered with corner cases. You'd
have to find each file or directory affected by the rollback and make sure
appropriate actions are taken for each one, and get it done in a VFS friendly
deadlock free way.

It would definitely be an interesting project. But, a much more common
feature request is the ability to do a few small things in an atomic unit
(like Ceph), and I think that is a much more realistic project for the short
term.

-chris
Bron Gondwana
2008-04-27 10:55:08 UTC
Permalink
Post by Chris Mason
Post by Chris Mason
Post by Chris Mason
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: rollback
Filesystems can be databases, but not with the current APIs. Userland
simply isn't built around these semantics today.
proc A: mkdir dir1
proc A: create dir1/file1
proc B: add data to dir1/file1
proc B: fsync dir1/file1
proc A: unlink dir1/file1
proc A: rmdir dir1
I don't see the difference.
The main difference is that in the unlink case, the unlink goes through a
series of code in the VFS to make sure that open file handles stay viable
and that all of the other posix rules are followed. In the rollback case,
the filesystem has to do all of that on its own.
proc A: mkdir dir1
proc B: open dir1/file1 O_CREATE
proc A: rollback
proc B: close
[... I've trimmed the following a bit, it's only partially quoted...]
Doing the same thing with rmdir would fail because the directory wasn't
empty. In order to provide the rollback, the FS would have to wander
through all of the dentries and do something sane with them....
The main point is this kind of thing is littered with corner cases.
You'd have to find each file or directory affected by the rollback
and make sure appropriate actions are taken for each one, and get
it done in a VFS friendly deadlock free way.
Yeah, that's a good point. I suspect my first pass idea for this would
look remarkably like a soft-mounted NFS drive that had been disconnected.
Ooops, your little bit of filesystem went away - EIO, byebye.

Bron.
--
Bron Gondwana
brong at fastmail.fm
Chris Mason
2008-04-22 20:32:34 UTC
Permalink
Post by Sage Weil
Hi Chris,
These ioctls let a user application hold a transaction open while it
performs a series of operations. A final ioctl does a sync on the fs
(closing the current transaction). This is the main requirement for
Ceph's OSD to be able to keep the data it's storing in a btrfs volume
consistent, and AFAICS it works just fine. The application would do
something like
I'm definitely willing to include it for you to experiment with. Holding a
transaction from userland can indeed lead to deadlock, but in this case your
userland basically owns the server anyway. I'm worried about some nasty
corner cases still, but btrfs is blissfully ignoring those right now anyway.

One problem will be operations that are basically boundless (truncating a
file, large writes). Eventually the ENOSPC support will hook into the
transaction system to make sure a given operation reserves enough free space.

With your ioctls, the "do a bunch of stuff" will need to honor the same
accounting rules as the kernel code (which don't exist yet).

I thought your original plan was to do all of this from userland (without a
kernel filesystem at all)? The btrfs progs share most of the same code with
the kernel, so with a little love to the transaction and IO subsystems, you'd
be able to use it as a library style DB.

-chris
Sage Weil
2008-04-22 20:48:53 UTC
Permalink
Post by Chris Mason
I'm definitely willing to include it for you to experiment with. Holding a
transaction from userland can indeed lead to deadlock, but in this case your
userland basically owns the server anyway. I'm worried about some nasty
corner cases still, but btrfs is blissfully ignoring those right now anyway.
One problem will be operations that are basically boundless (truncating a
file, large writes). Eventually the ENOSPC support will hook into the
transaction system to make sure a given operation reserves enough free space.
With your ioctls, the "do a bunch of stuff" will need to honor the same
accounting rules as the kernel code (which don't exist yet).
So, if the transaction start ioctl made a space reservation, and if _all_
fs ops were wrapped by such reservations, that should avoid ENOSPC, yeah?

That doesn't really help with the memory pressure issue, though. :(
Post by Chris Mason
I thought your original plan was to do all of this from userland (without a
kernel filesystem at all)? The btrfs progs share most of the same code with
the kernel, so with a little love to the transaction and IO subsystems, you'd
be able to use it as a library style DB.
Yeah... The issue is just that "a little love" is significantly more love
than this handful of ioctls, and I'm a little wary of getting into it.
That does seem like a better long term solution, though.

sage
Loading...