Commit bea9a6d2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2:
  ocfs2: Silence gcc warning in ocfs2_write_zero_page().
  jbd2/ocfs2: Fix block checksumming when a buffer is used in several transactions
  ocfs2/dlm: Remove BUG_ON from migration in the rare case of a down node
  ocfs2: Don't duplicate pages past i_size during CoW.
  ocfs2: tighten up strlen() checking
  ocfs2: Make xattr reflink work with new local alloc reservation.
  ocfs2: make xattr extension work with new local alloc reservation.
  ocfs2: Remove the redundant cpu_to_le64.
  ocfs2/dlm: don't access beyond bitmap size
  ocfs2: No need to zero pages past i_size.
  ocfs2: Zero the tail cluster when extending past i_size.
  ocfs2: When zero extending, do it by page.
  ocfs2: Limit default local alloc size within bitmap range.
  ocfs2: Move orphan scan work to ocfs2_wq.
  fs/ocfs2/dlm: Add missing spin_unlock
parents cd9f040d 5453258d
......@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
struct jbd2_buffer_trigger_type *triggers;
journal_t *journal = transaction->t_journal;
/*
......@@ -328,21 +327,21 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
triggers = jh_in->b_frozen_triggers;
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
triggers = jh_in->b_triggers;
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
* Fire any commit trigger. Do this before checking for escaping,
* as the trigger may modify the magic offset. If a copy-out
* happens afterwards, it will have the correct data in the buffer.
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
* offset. If a copy-out happens afterwards, it will have the correct
* data in the buffer.
*/
jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
triggers);
if (!done_copy_out)
jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
jh_in->b_triggers);
/*
* Check for escaping
......
......@@ -725,6 +725,9 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
page = jh2bh(jh)->b_page;
offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
source = kmap_atomic(page, KM_USER0);
/* Fire data frozen trigger just before we copy the data */
jbd2_buffer_frozen_trigger(jh, source + offset,
jh->b_triggers);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
......@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
jh->b_triggers = type;
}
void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
struct jbd2_buffer_trigger_type *triggers)
{
struct buffer_head *bh = jh2bh(jh);
if (!triggers || !triggers->t_commit)
if (!triggers || !triggers->t_frozen)
return;
triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}
void jbd2_buffer_abort_trigger(struct journal_head *jh,
......
......@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
(unsigned long long)past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
(unsigned long long)past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
bail:
if (err < 0)
err = -EIO;
......@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle;
int ret = 0;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
if (ret < 0)
mlog_errno(ret);
}
out:
if (ret) {
if (!IS_ERR(handle))
ocfs2_commit_trans(osb, handle);
handle = ERR_PTR(ret);
}
return handle;
}
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
......@@ -1131,23 +1100,37 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
u32 cpos, loff_t user_pos, int new,
u32 cpos, loff_t user_pos,
unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
unsigned long start, target_index, index;
unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
* page. Otherwise, we'll need a whole clusters worth.
* page. Otherwise, we'll need a whole clusters worth. If we're
* writing past i_size, we only need enough pages to cover the
* last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
/*
* We need the index *past* the last page we could possibly
* touch. This is the page past the end of the write or
* i_size, whichever is greater.
*/
last_byte = max(user_pos + user_len, i_size_read(inode));
BUG_ON(last_byte < 1);
end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
if ((start + wc->w_num_pages) > end_index)
wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
......@@ -1620,21 +1603,20 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
unsigned len,
static int ocfs2_expand_nonsparse_inode(struct inode *inode,
struct buffer_head *di_bh,
loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
if (ocfs2_sparse_alloc(osb))
return 0;
BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
ret = ocfs2_extend_no_holes(inode, newsize, pos);
ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
......@@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
return ret;
}
static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
loff_t pos)
{
int ret = 0;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (pos > i_size_read(inode))
ret = ocfs2_zero_extend(inode, di_bh, pos);
return ret;
}
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
......@@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
if (ocfs2_sparse_alloc(osb))
ret = ocfs2_zero_tail(inode, di_bh, pos);
else
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
wc);
if (ret) {
mlog_errno(ret);
goto out;
......@@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
......
......@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
if (strlen(domain) > O2NM_MAX_NAME_LEN) {
if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
......@@ -1709,6 +1709,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
}
if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
spin_unlock(&dlm_domain_lock);
mlog(ML_ERROR,
"Requested locking protocol version is not "
"compatible with already registered domain "
......
......@@ -2808,14 +2808,8 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
mlog(0, "trying again...\n");
goto again;
}
/* now that we are sure the MIGRATING state is there, drop
* the unneded state which blocked threads trying to DIRTY */
spin_lock(&res->spinlock);
BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
spin_unlock(&res->spinlock);
ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
......@@ -2825,10 +2819,22 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
}
spin_unlock(&dlm->spinlock);
/*
* if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
* another try; otherwise, we are sure the MIGRATING state is there,
* drop the unneded state which blocked threads trying to DIRTY
*/
spin_lock(&res->spinlock);
BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
spin_unlock(&res->spinlock);
/*
* at this point:
*
* o the DLM_LOCK_RES_MIGRATING flag is set
* o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
......
......@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
......
......@@ -724,28 +724,55 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
return status;
}
/*
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
*/
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
int ret = 0;
if (!ocfs2_should_order_data(inode))
goto out;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_jbd2_file_inode(handle, inode);
if (ret < 0)
mlog_errno(ret);
out:
if (ret) {
if (!IS_ERR(handle))
ocfs2_commit_trans(osb, handle);
handle = ERR_PTR(ret);
}
return handle;
}
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode,
u64 size)
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
u64 abs_to)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
unsigned long index;
unsigned int offset;
unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
int ret;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
/* ugh. in prepare/commit_write, if from==to==start of block, we
** skip the prepare. make sure we never send an offset for the start
** of a block
*/
if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
offset++;
}
index = size >> PAGE_CACHE_SHIFT;
BUG_ON(abs_from >= abs_to);
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
if (!page) {
......@@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
goto out;
}
ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
/* Get the offsets within the page that we want to zero */
zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
if (!zero_to)
zero_to = PAGE_CACHE_SIZE;
if (ocfs2_should_order_data(inode)) {
handle = ocfs2_start_walk_page_trans(inode, page, offset,
offset);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
mlog(0,
"abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
(unsigned long long)abs_from, (unsigned long long)abs_to,
index, zero_from, zero_to);
/* We know that zero_from is block aligned */
for (block_start = zero_from; block_start < zero_to;
block_start = block_end) {
block_end = block_start + (1 << inode->i_blkbits);
/*
* block_start is block-aligned. Bump it by one to
* force ocfs2_{prepare,commit}_write() to zero the
* whole block.
*/
ret = ocfs2_prepare_write_nolock(inode, page,
block_start + 1,
block_start + 1);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
}
/* must not update i_size! */
ret = block_commit_write(page, offset, offset);
if (ret < 0)
mlog_errno(ret);
else
ret = 0;
if (!handle) {
handle = ocfs2_zero_start_ordered_transaction(inode);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
break;
}
}
/* must not update i_size! */
ret = block_commit_write(page, block_start + 1,
block_start + 1);
if (ret < 0)
mlog_errno(ret);
else
ret = 0;
}
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
unlock_page(page);
page_cache_release(page);
......@@ -786,22 +838,114 @@ static int ocfs2_write_zero_page(struct inode *inode,
return ret;
}
static int ocfs2_zero_extend(struct inode *inode,
u64 zero_to_size)
/*
* Find the next range to zero. We do this in terms of bytes because
* that's what ocfs2_zero_extend() wants, and it is dealing with the
* pagecache. We may return multiple extents.
*
* zero_start and zero_end are ocfs2_zero_extend()s current idea of what
* needs to be zeroed. range_start and range_end return the next zeroing
* range. A subsequent call should pass the previous range_end as its
* zero_start. If range_end is 0, there's nothing to do.
*
* Unwritten extents are skipped over. Refcounted extents are CoWd.
*/
static int ocfs2_zero_extend_get_range(struct inode *inode,
struct buffer_head *di_bh,
u64 zero_start, u64 zero_end,
u64 *range_start, u64 *range_end)
{
int ret = 0;
u64 start_off;
struct super_block *sb = inode->i_sb;
int rc = 0, needs_cow = 0;
u32 p_cpos, zero_clusters = 0;
u32 zero_cpos =
zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
unsigned int num_clusters = 0;
unsigned int ext_flags = 0;
start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
while (start_off < zero_to_size) {
ret = ocfs2_write_zero_page(inode, start_off);
if (ret < 0) {
mlog_errno(ret);
while (zero_cpos < last_cpos) {
rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
&num_clusters, &ext_flags);
if (rc) {
mlog_errno(rc);
goto out;
}
if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
zero_clusters = num_clusters;
if (ext_flags & OCFS2_EXT_REFCOUNTED)
needs_cow = 1;
break;
}
zero_cpos += num_clusters;
}
if (!zero_clusters) {
*range_end = 0;
goto out;
}
while ((zero_cpos + zero_clusters) < last_cpos) {
rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
&p_cpos, &num_clusters,
&ext_flags);
if (rc) {
mlog_errno(rc);
goto out;
}
start_off += sb->s_blocksize;
if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
break;
if (ext_flags & OCFS2_EXT_REFCOUNTED)
needs_cow = 1;
zero_clusters += num_clusters;
}
if ((zero_cpos + zero_clusters) > last_cpos)
zero_clusters = last_cpos - zero_cpos;
if (needs_cow) {
rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
UINT_MAX);
if (rc) {
mlog_errno(rc);
goto out;
}
}
*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
zero_cpos + zero_clusters);
out:
return rc;
}
/*
* Zero one range returned from ocfs2_zero_extend_get_range(). The caller
* has made sure that the entire range needs zeroing.
*/
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
u64 range_end)
{
int rc = 0;
u64 next_pos;
u64 zero_pos = range_start;
mlog(0, "range_start = %llu, range_end = %llu\n",
(unsigned long long)range_start,
(unsigned long long)range_end);
BUG_ON(range_start >= range_end);
while (zero_pos < range_end) {
next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
if (rc < 0) {
mlog_errno(rc);
break;
}
zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
......@@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
out:
return rc;
}
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to_size)
{
int ret = 0;
u64 zero_start, range_start = 0, range_end = 0;
struct super_block *sb = inode->i_sb;
zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
mlog(0, "zero_start %llu for i_size %llu\n",
(unsigned long long)zero_start,
(unsigned long long)i_size_read(inode));
while (zero_start < zero_to_size) {
ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
zero_to_size,
&range_start,
&range_end);
if (ret) {
mlog_errno(ret);
break;
}
if (!range_end)
break;
/* Trim the ends */
if (range_start < zero_start)
range_start = zero_start;
if (range_end > zero_to_size)
range_end = zero_to_size;
ret = ocfs2_zero_extend_range(inode, range_start,
range_end);
if (ret) {
mlog_errno(ret);
break;
}
zero_start = range_end;
}
return ret;