s3: Optimize the write cache for sequential writes
[kamenim/samba.git] / source3 / smbd / fileio.c
1 /* 
2    Unix SMB/Netbios implementation.
3    Version 1.9.
4    read/write to a files_struct
5    Copyright (C) Andrew Tridgell 1992-1998
6    Copyright (C) Jeremy Allison 2000-2002. - write cache.
7    
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include "includes.h"
23 #include "smbd/globals.h"
24
25 static bool setup_write_cache(files_struct *, SMB_OFF_T);
26
27 /****************************************************************************
28  Read from write cache if we can.
29 ****************************************************************************/
30
31 static bool read_from_write_cache(files_struct *fsp,char *data,SMB_OFF_T pos,size_t n)
32 {
33         write_cache *wcp = fsp->wcp;
34
35         if(!wcp) {
36                 return False;
37         }
38
39         if( n > wcp->data_size || pos < wcp->offset || pos + n > wcp->offset + wcp->data_size) {
40                 return False;
41         }
42
43         memcpy(data, wcp->data + (pos - wcp->offset), n);
44
45         DO_PROFILE_INC(writecache_read_hits);
46
47         return True;
48 }
49
50 /****************************************************************************
51  Read from a file.
52 ****************************************************************************/
53
54 ssize_t read_file(files_struct *fsp,char *data,SMB_OFF_T pos,size_t n)
55 {
56         ssize_t ret=0,readret;
57
58         /* you can't read from print files */
59         if (fsp->print_file) {
60                 errno = EBADF;
61                 return -1;
62         }
63
64         /*
65          * Serve from write cache if we can.
66          */
67
68         if(read_from_write_cache(fsp, data, pos, n)) {
69                 fsp->fh->pos = pos + n;
70                 fsp->fh->position_information = fsp->fh->pos;
71                 return n;
72         }
73
74         flush_write_cache(fsp, READ_FLUSH);
75
76         fsp->fh->pos = pos;
77
78         if (n > 0) {
79 #ifdef DMF_FIX
80                 int numretries = 3;
81 tryagain:
82                 readret = SMB_VFS_PREAD(fsp,data,n,pos);
83
84                 if (readret == -1) {
85                         if ((errno == EAGAIN) && numretries) {
86                                 DEBUG(3,("read_file EAGAIN retry in 10 seconds\n"));
87                                 (void)sleep(10);
88                                 --numretries;
89                                 goto tryagain;
90                         }
91                         return -1;
92                 }
93 #else /* NO DMF fix. */
94                 readret = SMB_VFS_PREAD(fsp,data,n,pos);
95
96                 if (readret == -1) {
97                         return -1;
98                 }
99 #endif
100                 if (readret > 0) {
101                         ret += readret;
102                 }
103         }
104
105         DEBUG(10,("read_file (%s): pos = %.0f, size = %lu, returned %lu\n",
106                   fsp_str_dbg(fsp), (double)pos, (unsigned long)n, (long)ret));
107
108         fsp->fh->pos += ret;
109         fsp->fh->position_information = fsp->fh->pos;
110
111         return(ret);
112 }
113
114 /****************************************************************************
115  *Really* write to a file.
116 ****************************************************************************/
117
118 static ssize_t real_write_file(struct smb_request *req,
119                                 files_struct *fsp,
120                                 const char *data,
121                                 SMB_OFF_T pos,
122                                 size_t n)
123 {
124         ssize_t ret;
125
126         if (pos == -1) {
127                 ret = vfs_write_data(req, fsp, data, n);
128         } else {
129                 fsp->fh->pos = pos;
130                 if (pos && lp_strict_allocate(SNUM(fsp->conn))) {
131                         if (vfs_fill_sparse(fsp, pos) == -1) {
132                                 return -1;
133                         }
134                 }
135                 ret = vfs_pwrite_data(req, fsp, data, n, pos);
136         }
137
138         DEBUG(10,("real_write_file (%s): pos = %.0f, size = %lu, returned %ld\n",
139                   fsp_str_dbg(fsp), (double)pos, (unsigned long)n, (long)ret));
140
141         if (ret != -1) {
142                 fsp->fh->pos += ret;
143
144 /* Yes - this is correct - writes don't update this. JRA. */
145 /* Found by Samba4 tests. */
146 #if 0
147                 fsp->position_information = fsp->pos;
148 #endif
149         }
150
151         return ret;
152 }
153
154 /****************************************************************************
155  File size cache change.
156  Updates size on disk but doesn't flush the cache.
157 ****************************************************************************/
158
159 static int wcp_file_size_change(files_struct *fsp)
160 {
161         int ret;
162         write_cache *wcp = fsp->wcp;
163
164         wcp->file_size = wcp->offset + wcp->data_size;
165         ret = SMB_VFS_FTRUNCATE(fsp, wcp->file_size);
166         if (ret == -1) {
167                 DEBUG(0,("wcp_file_size_change (%s): ftruncate of size %.0f "
168                          "error %s\n", fsp_str_dbg(fsp),
169                          (double)wcp->file_size, strerror(errno)));
170         }
171         return ret;
172 }
173
174 void update_write_time_handler(struct event_context *ctx,
175                                       struct timed_event *te,
176                                       struct timeval now,
177                                       void *private_data)
178 {
179         files_struct *fsp = (files_struct *)private_data;
180
181         DEBUG(5, ("Update write time on %s\n", fsp_str_dbg(fsp)));
182
183         /* change the write time in the open file db. */
184         (void)set_write_time(fsp->file_id, timespec_current());
185
186         /* And notify. */
187         notify_fname(fsp->conn, NOTIFY_ACTION_MODIFIED,
188                      FILE_NOTIFY_CHANGE_LAST_WRITE, fsp->fsp_name->base_name);
189
190         /* Remove the timed event handler. */
191         TALLOC_FREE(fsp->update_write_time_event);
192 }
193
194 /*********************************************************
195  Schedule a write time update for WRITE_TIME_UPDATE_USEC_DELAY
196  in the future.
197 *********************************************************/
198
199 void trigger_write_time_update(struct files_struct *fsp)
200 {
201         int delay;
202
203         if (fsp->posix_open) {
204                 /* Don't use delayed writes on POSIX files. */
205                 return;
206         }
207
208         if (fsp->write_time_forced) {
209                 /* No point - "sticky" write times
210                  * in effect.
211                  */
212                 return;
213         }
214
215         /* We need to remember someone did a write
216          * and update to current time on close. */
217
218         fsp->update_write_time_on_close = true;
219
220         if (fsp->update_write_time_triggered) {
221                 /*
222                  * We only update the write time after 2 seconds
223                  * on the first normal write. After that
224                  * no other writes affect this until close.
225                  */
226                 return;
227         }
228         fsp->update_write_time_triggered = true;
229
230         delay = lp_parm_int(SNUM(fsp->conn),
231                             "smbd", "writetimeupdatedelay",
232                             WRITE_TIME_UPDATE_USEC_DELAY);
233
234         DEBUG(5, ("Update write time %d usec later on %s\n",
235                   delay, fsp_str_dbg(fsp)));
236
237         /* trigger the update 2 seconds later */
238         fsp->update_write_time_event =
239                 event_add_timed(smbd_event_context(), NULL,
240                                 timeval_current_ofs(0, delay),
241                                 update_write_time_handler, fsp);
242 }
243
244 void trigger_write_time_update_immediate(struct files_struct *fsp)
245 {
246         struct smb_file_time ft;
247
248         if (fsp->posix_open) {
249                 /* Don't use delayed writes on POSIX files. */
250                 return;
251         }
252
253         if (fsp->write_time_forced) {
254                 /*
255                  * No point - "sticky" write times
256                  * in effect.
257                  */
258                 return;
259         }
260
261         TALLOC_FREE(fsp->update_write_time_event);
262         DEBUG(5, ("Update write time immediate on %s\n",
263                   fsp_str_dbg(fsp)));
264
265         /* After an immediate update, reset the trigger. */
266         fsp->update_write_time_triggered = true;
267         fsp->update_write_time_on_close = false;
268
269         ZERO_STRUCT(ft);
270         ft.mtime = timespec_current();
271
272         /* Update the time in the open file db. */
273         (void)set_write_time(fsp->file_id, ft.mtime);
274
275         /* Now set on disk - takes care of notify. */
276         (void)smb_set_file_time(fsp->conn, fsp, fsp->fsp_name, &ft, false);
277 }
278
279 /****************************************************************************
280  Write to a file.
281 ****************************************************************************/
282
283 ssize_t write_file(struct smb_request *req,
284                         files_struct *fsp,
285                         const char *data,
286                         SMB_OFF_T pos,
287                         size_t n)
288 {
289         write_cache *wcp = fsp->wcp;
290         ssize_t total_written = 0;
291         int write_path = -1;
292
293         if (fsp->print_file) {
294                 uint32 jobid;
295
296                 if (!rap_to_pjobid(fsp->print_file->rap_jobid, NULL, &jobid)) {
297                         DEBUG(3, ("write_file: "
298                                   "Unable to map RAP jobid %u to jobid.\n",
299                                   (unsigned int)fsp->print_file->rap_jobid));
300                         errno = EBADF;
301                         return -1;
302                 }
303
304                 /* support seeks for print files bigger than 4G */
305                 pos = printfile_offset(fsp, pos);
306
307                 return print_job_write(SNUM(fsp->conn), jobid, data, pos, n);
308         }
309
310         if (!fsp->can_write) {
311                 errno = EPERM;
312                 return -1;
313         }
314
315         if (!fsp->modified) {
316                 fsp->modified = True;
317
318                 if (SMB_VFS_FSTAT(fsp, &fsp->fsp_name->st) == 0) {
319                         int dosmode;
320                         trigger_write_time_update(fsp);
321                         dosmode = dos_mode(fsp->conn, fsp->fsp_name);
322                         if ((lp_store_dos_attributes(SNUM(fsp->conn)) ||
323                                         MAP_ARCHIVE(fsp->conn)) &&
324                                         !IS_DOS_ARCHIVE(dosmode)) {
325                                 file_set_dosmode(fsp->conn, fsp->fsp_name,
326                                                  dosmode | aARCH, NULL, false);
327                         }
328
329                         /*
330                          * If this is the first write and we have an exclusive oplock then setup
331                          * the write cache.
332                          */
333
334                         if (EXCLUSIVE_OPLOCK_TYPE(fsp->oplock_type) && !wcp) {
335                                 setup_write_cache(fsp,
336                                                  fsp->fsp_name->st.st_ex_size);
337                                 wcp = fsp->wcp;
338                         }
339                 }
340         }
341
342 #ifdef WITH_PROFILE
343         DO_PROFILE_INC(writecache_total_writes);
344         if (!fsp->oplock_type) {
345                 DO_PROFILE_INC(writecache_non_oplock_writes);
346         }
347 #endif
348
349         /*
350          * If this file is level II oplocked then we need
351          * to grab the shared memory lock and inform all
352          * other files with a level II lock that they need
353          * to flush their read caches. We keep the lock over
354          * the shared memory area whilst doing this.
355          */
356
357         /* This should actually be improved to span the write. */
358         contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WRITE);
359         contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WRITE);
360
361 #ifdef WITH_PROFILE
362         if (profile_p && profile_p->writecache_total_writes % 500 == 0) {
363                 DEBUG(3,("WRITECACHE: initwrites=%u abutted=%u total=%u \
364 nonop=%u allocated=%u active=%u direct=%u perfect=%u readhits=%u\n",
365                         profile_p->writecache_init_writes,
366                         profile_p->writecache_abutted_writes,
367                         profile_p->writecache_total_writes,
368                         profile_p->writecache_non_oplock_writes,
369                         profile_p->writecache_allocated_write_caches,
370                         profile_p->writecache_num_write_caches,
371                         profile_p->writecache_direct_writes,
372                         profile_p->writecache_num_perfect_writes,
373                         profile_p->writecache_read_hits ));
374
375                 DEBUG(3,("WRITECACHE: Flushes SEEK=%d, READ=%d, WRITE=%d, READRAW=%d, OPLOCK=%d, CLOSE=%d, SYNC=%d\n",
376                         profile_p->writecache_flushed_writes[SEEK_FLUSH],
377                         profile_p->writecache_flushed_writes[READ_FLUSH],
378                         profile_p->writecache_flushed_writes[WRITE_FLUSH],
379                         profile_p->writecache_flushed_writes[READRAW_FLUSH],
380                         profile_p->writecache_flushed_writes[OPLOCK_RELEASE_FLUSH],
381                         profile_p->writecache_flushed_writes[CLOSE_FLUSH],
382                         profile_p->writecache_flushed_writes[SYNC_FLUSH] ));
383         }
384 #endif
385
386         if (wcp && req->unread_bytes) {
387                 /* If we're using receivefile don't
388                  * deal with a write cache.
389                  */
390                 flush_write_cache(fsp, WRITE_FLUSH);
391                 delete_write_cache(fsp);
392                 wcp = NULL;
393         }
394
395         if(!wcp) {
396                 DO_PROFILE_INC(writecache_direct_writes);
397                 total_written = real_write_file(req, fsp, data, pos, n);
398                 return total_written;
399         }
400
401         DEBUG(9,("write_file (%s)(fd=%d pos=%.0f size=%u) wcp->offset=%.0f "
402                  "wcp->data_size=%u\n", fsp_str_dbg(fsp), fsp->fh->fd,
403                  (double)pos, (unsigned int)n, (double)wcp->offset,
404                  (unsigned int)wcp->data_size));
405
406         fsp->fh->pos = pos + n;
407
408         /*
409          * If we have active cache and it isn't contiguous then we flush.
410          * NOTE: There is a small problem with running out of disk ....
411          */
412
413         if (wcp->data_size) {
414                 bool cache_flush_needed = False;
415
416                 if ((pos >= wcp->offset) && (pos <= wcp->offset + wcp->data_size)) {
417       
418                         /* ASCII art.... JRA.
419
420       +--------------+-----
421       | Cached data  | Rest of allocated cache buffer....
422       +--------------+-----
423
424             +-------------------+
425             | Data to write     |
426             +-------------------+
427
428                         */
429
430                         /*
431                          * Start of write overlaps or abutts the existing data.
432                          */
433
434                         size_t data_used = MIN((wcp->alloc_size - (pos - wcp->offset)), n);
435
436                         memcpy(wcp->data + (pos - wcp->offset), data, data_used);
437
438                         /*
439                          * Update the current buffer size with the new data.
440                          */
441
442                         if(pos + data_used > wcp->offset + wcp->data_size) {
443                                 wcp->data_size = pos + data_used - wcp->offset;
444                         }
445
446                         /*
447                          * Update the file size if changed.
448                          */
449
450                         if (wcp->offset + wcp->data_size > wcp->file_size) {
451                                 if (wcp_file_size_change(fsp) == -1) {
452                                         return -1;
453                                 }
454                         }
455
456                         /*
457                          * If we used all the data then
458                          * return here.
459                          */
460
461                         if(n == data_used) {
462                                 return n;
463                         } else {
464                                 cache_flush_needed = True;
465                         }
466                         /*
467                          * Move the start of data forward by the amount used,
468                          * cut down the amount left by the same amount.
469                          */
470
471                         data += data_used;
472                         pos += data_used;
473                         n -= data_used;
474
475                         DO_PROFILE_INC(writecache_abutted_writes);
476                         total_written = data_used;
477
478                         write_path = 1;
479
480                 } else if ((pos < wcp->offset) && (pos + n > wcp->offset) && 
481                                         (pos + n <= wcp->offset + wcp->alloc_size)) {
482
483                         /* ASCII art.... JRA.
484
485                         +---------------+
486                         | Cache buffer  |
487                         +---------------+
488
489             +-------------------+
490             | Data to write     |
491             +-------------------+
492
493                         */
494
495                         /*
496                          * End of write overlaps the existing data.
497                          */
498
499                         size_t data_used = pos + n - wcp->offset;
500
501                         memcpy(wcp->data, data + n - data_used, data_used);
502
503                         /*
504                          * Update the current buffer size with the new data.
505                          */
506
507                         if(pos + n > wcp->offset + wcp->data_size) {
508                                 wcp->data_size = pos + n - wcp->offset;
509                         }
510
511                         /*
512                          * Update the file size if changed.
513                          */
514
515                         if (wcp->offset + wcp->data_size > wcp->file_size) {
516                                 if (wcp_file_size_change(fsp) == -1) {
517                                         return -1;
518                                 }
519                         }
520
521                         /*
522                          * We don't need to move the start of data, but we
523                          * cut down the amount left by the amount used.
524                          */
525
526                         n -= data_used;
527
528                         /*
529                          * We cannot have used all the data here.
530                          */
531
532                         cache_flush_needed = True;
533
534                         DO_PROFILE_INC(writecache_abutted_writes);
535                         total_written = data_used;
536
537                         write_path = 2;
538
539                 } else if ( (pos >= wcp->file_size) && 
540                                         (wcp->offset + wcp->data_size == wcp->file_size) &&
541                                         (pos > wcp->offset + wcp->data_size) && 
542                                         (pos < wcp->offset + wcp->alloc_size) ) {
543
544                         /* ASCII art.... JRA.
545
546                        End of file ---->|
547
548                         +---------------+---------------+
549                         | Cached data   | Cache buffer  |
550                         +---------------+---------------+
551
552                                               +-------------------+
553                                               | Data to write     |
554                                               +-------------------+
555
556                         */
557
558                         /*
559                          * Non-contiguous write part of which fits within
560                          * the cache buffer and is extending the file
561                          * and the cache contents reflect the current
562                          * data up to the current end of the file.
563                          */
564
565                         size_t data_used;
566
567                         if(pos + n <= wcp->offset + wcp->alloc_size) {
568                                 data_used = n;
569                         } else {
570                                 data_used = wcp->offset + wcp->alloc_size - pos;
571                         }
572
573                         /*
574                          * Fill in the non-continuous area with zeros.
575                          */
576
577                         memset(wcp->data + wcp->data_size, '\0',
578                                 pos - (wcp->offset + wcp->data_size) );
579
580                         memcpy(wcp->data + (pos - wcp->offset), data, data_used);
581
582                         /*
583                          * Update the current buffer size with the new data.
584                          */
585
586                         if(pos + data_used > wcp->offset + wcp->data_size) {
587                                 wcp->data_size = pos + data_used - wcp->offset;
588                         }
589
590                         /*
591                          * Update the file size if changed.
592                          */
593
594                         if (wcp->offset + wcp->data_size > wcp->file_size) {
595                                 if (wcp_file_size_change(fsp) == -1) {
596                                         return -1;
597                                 }
598                         }
599
600                         /*
601                          * If we used all the data then
602                          * return here.
603                          */
604
605                         if(n == data_used) {
606                                 return n;
607                         } else {
608                                 cache_flush_needed = True;
609                         }
610
611                         /*
612                          * Move the start of data forward by the amount used,
613                          * cut down the amount left by the same amount.
614                          */
615
616                         data += data_used;
617                         pos += data_used;
618                         n -= data_used;
619
620                         DO_PROFILE_INC(writecache_abutted_writes);
621                         total_written = data_used;
622
623                         write_path = 3;
624
625                 } else if ( (pos >= wcp->file_size) &&
626                             (n == 1) &&
627                             (wcp->file_size == wcp->offset + wcp->data_size) &&
628                             (pos < wcp->file_size + wcp->alloc_size)) {
629
630                         /*
631
632                 End of file ---->|
633
634                  +---------------+---------------+
635                  | Cached data   | Cache buffer  |
636                  +---------------+---------------+
637
638                                  |<------- allocated size ---------------->|
639
640                                                          +--------+
641                                                          | 1 Byte |
642                                                          +--------+
643
644                         MS-Office seems to do this a lot to determine if there's enough
645                         space on the filesystem to write a new file.
646
647                         Change to :
648
649                 End of file ---->|
650                                  +-----------------------+--------+
651                                  | Zeroed Cached data    | 1 Byte |
652                                  +-----------------------+--------+
653                         */
654
655                         flush_write_cache(fsp, WRITE_FLUSH);
656                         wcp->offset = wcp->file_size;
657                         wcp->data_size = pos - wcp->file_size + 1;
658                         memset(wcp->data, '\0', wcp->data_size);
659                         memcpy(wcp->data + wcp->data_size-1, data, 1);
660
661                         /*
662                          * Update the file size if changed.
663                          */
664
665                         if (wcp->offset + wcp->data_size > wcp->file_size) {
666                                 if (wcp_file_size_change(fsp) == -1) {
667                                         return -1;
668                                 }
669                         }
670
671                         return n;
672
673                 } else {
674
675                         /* ASCII art..... JRA.
676
677    Case 1).
678
679                         +---------------+---------------+
680                         | Cached data   | Cache buffer  |
681                         +---------------+---------------+
682
683                                                               +-------------------+
684                                                               | Data to write     |
685                                                               +-------------------+
686
687    Case 2).
688
689                            +---------------+---------------+
690                            | Cached data   | Cache buffer  |
691                            +---------------+---------------+
692
693    +-------------------+
694    | Data to write     |
695    +-------------------+
696
697     Case 3).
698
699                            +---------------+---------------+
700                            | Cached data   | Cache buffer  |
701                            +---------------+---------------+
702
703                   +-----------------------------------------------------+
704                   | Data to write                                       |
705                   +-----------------------------------------------------+
706
707                   */
708
709                         /*
710                          * Write is bigger than buffer, or there is no overlap on the
711                          * low or high ends.
712                          */
713
714                         DEBUG(9,("write_file: non cacheable write : fd = %d, pos = %.0f, len = %u, current cache pos = %.0f \
715 len = %u\n",fsp->fh->fd, (double)pos, (unsigned int)n, (double)wcp->offset, (unsigned int)wcp->data_size ));
716
717                         /*
718                          * If write would fit in the cache, and is larger than
719                          * the data already in the cache, flush the cache and
720                          * preferentially copy the data new data into it. Otherwise
721                          * just write the data directly.
722                          */
723
724                         if ( n <= wcp->alloc_size && n > wcp->data_size) {
725                                 cache_flush_needed = True;
726                         } else {
727                                 ssize_t ret = real_write_file(NULL,fsp, data, pos, n);
728
729                                 /*
730                                  * If the write overlaps the entire cache, then
731                                  * discard the current contents of the cache.
732                                  * Fix from Rasmus Borup Hansen rbh@math.ku.dk.
733                                  */
734
735                                 if ((pos <= wcp->offset) &&
736                                                 (pos + n >= wcp->offset + wcp->data_size) ) {
737                                         DEBUG(9,("write_file: discarding overwritten write \
738 cache: fd = %d, off=%.0f, size=%u\n", fsp->fh->fd, (double)wcp->offset, (unsigned int)wcp->data_size ));
739                                         wcp->data_size = 0;
740                                 }
741
742                                 DO_PROFILE_INC(writecache_direct_writes);
743                                 if (ret == -1) {
744                                         return ret;
745                                 }
746
747                                 if (pos + ret > wcp->file_size) {
748                                         wcp->file_size = pos + ret;
749                                 }
750
751                                 return ret;
752                         }
753
754                         write_path = 4;
755
756                 }
757
758                 if (cache_flush_needed) {
759                         DEBUG(3,("WRITE_FLUSH:%d: due to noncontinuous write: fd = %d, size = %.0f, pos = %.0f, \
760 n = %u, wcp->offset=%.0f, wcp->data_size=%u\n",
761                                 write_path, fsp->fh->fd, (double)wcp->file_size, (double)pos, (unsigned int)n,
762                                 (double)wcp->offset, (unsigned int)wcp->data_size ));
763
764                         flush_write_cache(fsp, WRITE_FLUSH);
765                 }
766         }
767
768         /*
769          * If the write request is bigger than the cache
770          * size, write it all out.
771          */
772
773         if (n > wcp->alloc_size ) {
774                 ssize_t ret = real_write_file(NULL,fsp, data, pos, n);
775                 if (ret == -1) {
776                         return -1;
777                 }
778
779                 if (pos + ret > wcp->file_size) {
780                         wcp->file_size = pos + n;
781                 }
782
783                 DO_PROFILE_INC(writecache_direct_writes);
784                 return total_written + n;
785         }
786
787         /*
788          * If there's any data left, cache it.
789          */
790
791         if (n) {
792 #ifdef WITH_PROFILE
793                 if (wcp->data_size) {
794                         DO_PROFILE_INC(writecache_abutted_writes);
795                 } else {
796                         DO_PROFILE_INC(writecache_init_writes);
797                 }
798 #endif
799
800                 if ((wcp->data_size == 0)
801                     && (pos > wcp->file_size)
802                     && (pos + n <= wcp->file_size + wcp->alloc_size)) {
803                         /*
804                          * This is a write completely beyond the
805                          * current EOF, but within reach of the write
806                          * cache. We expect fill-up writes pretty
807                          * soon, so it does not make sense to start
808                          * the write cache at the current
809                          * offset. These fill-up writes would trigger
810                          * separate pwrites or even unnecessary cache
811                          * flushes because they overlap if this is a
812                          * one-byte allocating write.
813                          */
814                         wcp->offset = wcp->file_size;
815                         wcp->data_size = pos - wcp->file_size;
816                         memset(wcp->data, 0, wcp->data_size);
817                 }
818
819                 memcpy(wcp->data+wcp->data_size, data, n);
820                 if (wcp->data_size == 0) {
821                         wcp->offset = pos;
822                         DO_PROFILE_INC(writecache_num_write_caches);
823                 }
824                 wcp->data_size += n;
825
826                 /*
827                  * Update the file size if changed.
828                  */
829
830                 if (wcp->offset + wcp->data_size > wcp->file_size) {
831                         if (wcp_file_size_change(fsp) == -1) {
832                                 return -1;
833                         }
834                 }
835                 DEBUG(9,("wcp->offset = %.0f wcp->data_size = %u cache return %u\n",
836                         (double)wcp->offset, (unsigned int)wcp->data_size, (unsigned int)n));
837
838                 total_written += n;
839                 return total_written; /* .... that's a write :) */
840         }
841   
842         return total_written;
843 }
844
845 /****************************************************************************
846  Delete the write cache structure.
847 ****************************************************************************/
848
849 void delete_write_cache(files_struct *fsp)
850 {
851         write_cache *wcp;
852
853         if(!fsp) {
854                 return;
855         }
856
857         if(!(wcp = fsp->wcp)) {
858                 return;
859         }
860
861         DO_PROFILE_DEC(writecache_allocated_write_caches);
862         allocated_write_caches--;
863
864         SMB_ASSERT(wcp->data_size == 0);
865
866         SAFE_FREE(wcp->data);
867         SAFE_FREE(fsp->wcp);
868
869         DEBUG(10,("delete_write_cache: File %s deleted write cache\n",
870                   fsp_str_dbg(fsp)));
871 }
872
873 /****************************************************************************
874  Setup the write cache structure.
875 ****************************************************************************/
876
877 static bool setup_write_cache(files_struct *fsp, SMB_OFF_T file_size)
878 {
879         ssize_t alloc_size = lp_write_cache_size(SNUM(fsp->conn));
880         write_cache *wcp;
881
882         if (allocated_write_caches >= MAX_WRITE_CACHES) {
883                 return False;
884         }
885
886         if(alloc_size == 0 || fsp->wcp) {
887                 return False;
888         }
889
890         if((wcp = SMB_MALLOC_P(write_cache)) == NULL) {
891                 DEBUG(0,("setup_write_cache: malloc fail.\n"));
892                 return False;
893         }
894
895         wcp->file_size = file_size;
896         wcp->offset = 0;
897         wcp->alloc_size = alloc_size;
898         wcp->data_size = 0;
899         if((wcp->data = (char *)SMB_MALLOC(wcp->alloc_size)) == NULL) {
900                 DEBUG(0,("setup_write_cache: malloc fail for buffer size %u.\n",
901                         (unsigned int)wcp->alloc_size ));
902                 SAFE_FREE(wcp);
903                 return False;
904         }
905
906         memset(wcp->data, '\0', wcp->alloc_size );
907
908         fsp->wcp = wcp;
909         DO_PROFILE_INC(writecache_allocated_write_caches);
910         allocated_write_caches++;
911
912         DEBUG(10,("setup_write_cache: File %s allocated write cache size %lu\n",
913                   fsp_str_dbg(fsp), (unsigned long)wcp->alloc_size));
914
915         return True;
916 }
917
918 /****************************************************************************
919  Cope with a size change.
920 ****************************************************************************/
921
922 void set_filelen_write_cache(files_struct *fsp, SMB_OFF_T file_size)
923 {
924         if(fsp->wcp) {
925                 /* The cache *must* have been flushed before we do this. */
926                 if (fsp->wcp->data_size != 0) {
927                         char *msg;
928                         if (asprintf(&msg, "set_filelen_write_cache: size change "
929                                  "on file %s with write cache size = %lu\n",
930                                  fsp->fsp_name->base_name,
931                                  (unsigned long)fsp->wcp->data_size) != -1) {
932                                 smb_panic(msg);
933                         } else {
934                                 smb_panic("set_filelen_write_cache");
935                         }
936                 }
937                 fsp->wcp->file_size = file_size;
938         }
939 }
940
941 /*******************************************************************
942  Flush a write cache struct to disk.
943 ********************************************************************/
944
945 ssize_t flush_write_cache(files_struct *fsp, enum flush_reason_enum reason)
946 {
947         write_cache *wcp = fsp->wcp;
948         size_t data_size;
949         ssize_t ret;
950
951         if(!wcp || !wcp->data_size) {
952                 return 0;
953         }
954
955         data_size = wcp->data_size;
956         wcp->data_size = 0;
957
958         DO_PROFILE_DEC_INC(writecache_num_write_caches,writecache_flushed_writes[reason]);
959
960         DEBUG(9,("flushing write cache: fd = %d, off=%.0f, size=%u\n",
961                 fsp->fh->fd, (double)wcp->offset, (unsigned int)data_size));
962
963 #ifdef WITH_PROFILE
964         if(data_size == wcp->alloc_size) {
965                 DO_PROFILE_INC(writecache_num_perfect_writes);
966         }
967 #endif
968
969         ret = real_write_file(NULL, fsp, wcp->data, wcp->offset, data_size);
970
971         /*
972          * Ensure file size if kept up to date if write extends file.
973          */
974
975         if ((ret != -1) && (wcp->offset + ret > wcp->file_size)) {
976                 wcp->file_size = wcp->offset + ret;
977         }
978
979         return ret;
980 }
981
982 /*******************************************************************
983 sync a file
984 ********************************************************************/
985
986 NTSTATUS sync_file(connection_struct *conn, files_struct *fsp, bool write_through)
987 {
988         if (fsp->fh->fd == -1)
989                 return NT_STATUS_INVALID_HANDLE;
990
991         if (lp_strict_sync(SNUM(conn)) &&
992             (lp_syncalways(SNUM(conn)) || write_through)) {
993                 int ret = flush_write_cache(fsp, SYNC_FLUSH);
994                 if (ret == -1) {
995                         return map_nt_error_from_unix(errno);
996                 }
997                 ret = SMB_VFS_FSYNC(fsp);
998                 if (ret == -1) {
999                         return map_nt_error_from_unix(errno);
1000                 }
1001         }
1002         return NT_STATUS_OK;
1003 }
1004
1005 /************************************************************
1006  Perform a stat whether a valid fd or not.
1007 ************************************************************/
1008
1009 int fsp_stat(files_struct *fsp)
1010 {
1011         if (fsp->fh->fd == -1) {
1012                 if (fsp->posix_open) {
1013                         return SMB_VFS_LSTAT(fsp->conn, fsp->fsp_name);
1014                 } else {
1015                         return SMB_VFS_STAT(fsp->conn, fsp->fsp_name);
1016                 }
1017         } else {
1018                 return SMB_VFS_FSTAT(fsp, &fsp->fsp_name->st);
1019         }
1020 }