-
Notifications
You must be signed in to change notification settings - Fork 2
/
os_unix.c
1769 lines (1704 loc) · 57.5 KB
/
os_unix.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine.
* Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/
* Version 1.1.6
* For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
* please contact Symisc Systems via:
* or visit:
* http://unqlite.org/licensing.html
*/
/* $SymiscID: os_unix.c v1.3 FreeBSD 2013-04-05 01:10 devel <[email protected]> $ */
#ifndef UNQLITE_AMALGAMATION
#include "unqliteInt.h"
#endif
/*
* Omit the whole layer from the build if compiling for platforms other than Unix (Linux, BSD, Solaris, OS X, etc.).
* Note: Mostly SQLite3 source tree.
*/
#if defined(__UNIXES__)
/** This file contains the VFS implementation for unix-like operating systems
** include Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
**
** There are actually several different VFS implementations in this file.
** The differences are in the way that file locking is done. The default
** implementation uses Posix Advisory Locks. Alternative implementations
** use flock(), dot-files, various proprietary locking schemas, or simply
** skip locking all together.
**
** This source file is organized into divisions where the logic for various
** subfunctions is contained within the appropriate division. PLEASE
** KEEP THE STRUCTURE OF THIS FILE INTACT. New code should be placed
** in the correct division and should be clearly labeled.
**
*/
/*
** standard include files.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <errno.h>
#if defined(__APPLE__)
# include <sys/mount.h>
#endif
/*
** Allowed values of unixFile.fsFlags
*/
#define UNQLITE_FSFLAGS_IS_MSDOS 0x1
/*
** Default permissions when creating a new file
*/
#ifndef UNQLITE_DEFAULT_FILE_PERMISSIONS
# define UNQLITE_DEFAULT_FILE_PERMISSIONS 0644
#endif
/*
** Default permissions when creating auto proxy dir
*/
#ifndef UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS
# define UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
#endif
/*
** Maximum supported path-length.
*/
#define MAX_PATHNAME 512
/*
** Only set the lastErrno if the error code is a real error and not
** a normal expected return code of UNQLITE_BUSY or UNQLITE_OK
*/
#define IS_LOCK_ERROR(x) ((x != UNQLITE_OK) && (x != UNQLITE_BUSY))
/* Forward references */
typedef struct unixInodeInfo unixInodeInfo; /* An i-node */
typedef struct UnixUnusedFd UnixUnusedFd; /* An unused file descriptor */
/*
** Sometimes, after a file handle is closed by SQLite, the file descriptor
** cannot be closed immediately. In these cases, instances of the following
** structure are used to store the file descriptor while waiting for an
** opportunity to either close or reuse it.
*/
struct UnixUnusedFd {
int fd; /* File descriptor to close */
int flags; /* Flags this file descriptor was opened with */
UnixUnusedFd *pNext; /* Next unused file descriptor on same file */
};
/*
** The unixFile structure is subclass of unqlite3_file specific to the unix
** VFS implementations.
*/
typedef struct unixFile unixFile;
struct unixFile {
const unqlite_io_methods *pMethod; /* Always the first entry */
unixInodeInfo *pInode; /* Info about locks on this inode */
int h; /* The file descriptor */
int dirfd; /* File descriptor for the directory */
unsigned char eFileLock; /* The type of lock held on this fd */
int lastErrno; /* The unix errno from last I/O error */
void *lockingContext; /* Locking style specific state */
UnixUnusedFd *pUnused; /* Pre-allocated UnixUnusedFd */
int fileFlags; /* Miscellanous flags */
const char *zPath; /* Name of the file */
unsigned fsFlags; /* cached details from statfs() */
};
/*
** The following macros define bits in unixFile.fileFlags
*/
#define UNQLITE_WHOLE_FILE_LOCKING 0x0001 /* Use whole-file locking */
/*
** Define various macros that are missing from some systems.
*/
#ifndef O_LARGEFILE
# define O_LARGEFILE 0
#endif
#ifndef O_NOFOLLOW
# define O_NOFOLLOW 0
#endif
#ifndef O_BINARY
# define O_BINARY 0
#endif
/*
** Helper functions to obtain and relinquish the global mutex. The
** global mutex is used to protect the unixInodeInfo and
** vxworksFileId objects used by this file, all of which may be
** shared by multiple threads.
**
** Function unixMutexHeld() is used to assert() that the global mutex
** is held when required. This function is only used as part of assert()
** statements. e.g.
**
** unixEnterMutex()
** assert( unixMutexHeld() );
** unixEnterLeave()
*/
static void unixEnterMutex(void){
#ifdef UNQLITE_ENABLE_THREADS
const SyMutexMethods *pMutexMethods = SyMutexExportMethods();
if( pMutexMethods ){
SyMutex *pMutex = pMutexMethods->xNew(SXMUTEX_TYPE_STATIC_2); /* pre-allocated, never fail */
SyMutexEnter(pMutexMethods,pMutex);
}
#endif /* UNQLITE_ENABLE_THREADS */
}
static void unixLeaveMutex(void){
#ifdef UNQLITE_ENABLE_THREADS
const SyMutexMethods *pMutexMethods = SyMutexExportMethods();
if( pMutexMethods ){
SyMutex *pMutex = pMutexMethods->xNew(SXMUTEX_TYPE_STATIC_2); /* pre-allocated, never fail */
SyMutexLeave(pMutexMethods,pMutex);
}
#endif /* UNQLITE_ENABLE_THREADS */
}
/*
** This routine translates a standard POSIX errno code into something
** useful to the clients of the unqlite3 functions. Specifically, it is
** intended to translate a variety of "try again" errors into UNQLITE_BUSY
** and a variety of "please close the file descriptor NOW" errors into
** UNQLITE_IOERR
**
** Errors during initialization of locks, or file system support for locks,
** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
*/
static int unqliteErrorFromPosixError(int posixError, int unqliteIOErr) {
switch (posixError) {
case 0:
return UNQLITE_OK;
case EAGAIN:
case ETIMEDOUT:
case EBUSY:
case EINTR:
case ENOLCK:
/* random NFS retry error, unless during file system support
* introspection, in which it actually means what it says */
return UNQLITE_BUSY;
case EACCES:
/* EACCES is like EAGAIN during locking operations, but not any other time*/
return UNQLITE_BUSY;
case EPERM:
return UNQLITE_PERM;
case EDEADLK:
return UNQLITE_IOERR;
#if EOPNOTSUPP!=ENOTSUP
case EOPNOTSUPP:
/* something went terribly awry, unless during file system support
* introspection, in which it actually means what it says */
#endif
#ifdef ENOTSUP
case ENOTSUP:
/* invalid fd, unless during file system support introspection, in which
* it actually means what it says */
#endif
case EIO:
case EBADF:
case EINVAL:
case ENOTCONN:
case ENODEV:
case ENXIO:
case ENOENT:
case ESTALE:
case ENOSYS:
/* these should force the client to close the file and reconnect */
default:
return unqliteIOErr;
}
}
/******************************************************************************
*************************** Posix Advisory Locking ****************************
**
** POSIX advisory locks are broken by design. ANSI STD 1003.1 (1996)
** section 6.5.2.2 lines 483 through 490 specify that when a process
** sets or clears a lock, that operation overrides any prior locks set
** by the same process. It does not explicitly say so, but this implies
** that it overrides locks set by the same process using a different
** file descriptor. Consider this test case:
**
** int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
** int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
**
** Suppose ./file1 and ./file2 are really the same file (because
** one is a hard or symbolic link to the other) then if you set
** an exclusive lock on fd1, then try to get an exclusive lock
** on fd2, it works. I would have expected the second lock to
** fail since there was already a lock on the file due to fd1.
** But not so. Since both locks came from the same process, the
** second overrides the first, even though they were on different
** file descriptors opened on different file names.
**
** This means that we cannot use POSIX locks to synchronize file access
** among competing threads of the same process. POSIX locks will work fine
** to synchronize access for threads in separate processes, but not
** threads within the same process.
**
** To work around the problem, SQLite has to manage file locks internally
** on its own. Whenever a new database is opened, we have to find the
** specific inode of the database file (the inode is determined by the
** st_dev and st_ino fields of the stat structure that fstat() fills in)
** and check for locks already existing on that inode. When locks are
** created or removed, we have to look at our own internal record of the
** locks to see if another thread has previously set a lock on that same
** inode.
**
** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
** For VxWorks, we have to use the alternative unique ID system based on
** canonical filename and implemented in the previous division.)
**
** There is one locking structure
** per inode, so if the same inode is opened twice, both unixFile structures
** point to the same locking structure. The locking structure keeps
** a reference count (so we will know when to delete it) and a "cnt"
** field that tells us its internal lock status. cnt==0 means the
** file is unlocked. cnt==-1 means the file has an exclusive lock.
** cnt>0 means there are cnt shared locks on the file.
**
** Any attempt to lock or unlock a file first checks the locking
** structure. The fcntl() system call is only invoked to set a
** POSIX lock if the internal lock structure transitions between
** a locked and an unlocked state.
**
** But wait: there are yet more problems with POSIX advisory locks.
**
** If you close a file descriptor that points to a file that has locks,
** all locks on that file that are owned by the current process are
** released. To work around this problem, each unixInodeInfo object
** maintains a count of the number of pending locks on that inode.
** When an attempt is made to close an unixFile, if there are
** other unixFile open on the same inode that are holding locks, the call
** to close() the file descriptor is deferred until all of the locks clear.
** The unixInodeInfo structure keeps a list of file descriptors that need to
** be closed and that list is walked (and cleared) when the last lock
** clears.
**
** Yet another problem: LinuxThreads do not play well with posix locks.
**
** Many older versions of linux use the LinuxThreads library which is
** not posix compliant. Under LinuxThreads, a lock created by thread
** A cannot be modified or overridden by a different thread B.
** Only thread A can modify the lock. Locking behavior is correct
** if the appliation uses the newer Native Posix Thread Library (NPTL)
** on linux - with NPTL a lock created by thread A can override locks
** in thread B. But there is no way to know at compile-time which
** threading library is being used. So there is no way to know at
** compile-time whether or not thread A can override locks on thread B.
** One has to do a run-time check to discover the behavior of the
** current process.
**
*/
/*
** An instance of the following structure serves as the key used
** to locate a particular unixInodeInfo object.
*/
struct unixFileId {
dev_t dev; /* Device number */
ino_t ino; /* Inode number */
};
/*
** An instance of the following structure is allocated for each open
** inode. Or, on LinuxThreads, there is one of these structures for
** each inode opened by each thread.
**
** A single inode can have multiple file descriptors, so each unixFile
** structure contains a pointer to an instance of this object and this
** object keeps a count of the number of unixFile pointing to it.
*/
struct unixInodeInfo {
struct unixFileId fileId; /* The lookup key */
int nShared; /* Number of SHARED locks held */
int eFileLock; /* One of SHARED_LOCK, RESERVED_LOCK etc. */
int nRef; /* Number of pointers to this structure */
int nLock; /* Number of outstanding file locks */
UnixUnusedFd *pUnused; /* Unused file descriptors to close */
unixInodeInfo *pNext; /* List of all unixInodeInfo objects */
unixInodeInfo *pPrev; /* .... doubly linked */
};
static unixInodeInfo *inodeList = 0;
/*
* Local memory allocation stuff.
*/
static void * unqlite_malloc(sxu32 nByte)
{
SyMemBackend *pAlloc;
void *p;
pAlloc = (SyMemBackend *)unqliteExportMemBackend();
p = SyMemBackendAlloc(pAlloc,nByte);
return p;
}
static void unqlite_free(void *p)
{
SyMemBackend *pAlloc;
pAlloc = (SyMemBackend *)unqliteExportMemBackend();
SyMemBackendFree(pAlloc,p);
}
/*
** Close all file descriptors accumuated in the unixInodeInfo->pUnused list.
** If all such file descriptors are closed without error, the list is
** cleared and UNQLITE_OK returned.
**
** Otherwise, if an error occurs, then successfully closed file descriptor
** entries are removed from the list, and UNQLITE_IOERR_CLOSE returned.
** not deleted and UNQLITE_IOERR_CLOSE returned.
*/
static int closePendingFds(unixFile *pFile){
int rc = UNQLITE_OK;
unixInodeInfo *pInode = pFile->pInode;
UnixUnusedFd *pError = 0;
UnixUnusedFd *p;
UnixUnusedFd *pNext;
for(p=pInode->pUnused; p; p=pNext){
pNext = p->pNext;
if( close(p->fd) ){
pFile->lastErrno = errno;
rc = UNQLITE_IOERR;
p->pNext = pError;
pError = p;
}else{
unqlite_free(p);
}
}
pInode->pUnused = pError;
return rc;
}
/*
** Release a unixInodeInfo structure previously allocated by findInodeInfo().
**
** The mutex entered using the unixEnterMutex() function must be held
** when this function is called.
*/
static void releaseInodeInfo(unixFile *pFile){
unixInodeInfo *pInode = pFile->pInode;
if( pInode ){
pInode->nRef--;
if( pInode->nRef==0 ){
closePendingFds(pFile);
if( pInode->pPrev ){
pInode->pPrev->pNext = pInode->pNext;
}else{
inodeList = pInode->pNext;
}
if( pInode->pNext ){
pInode->pNext->pPrev = pInode->pPrev;
}
unqlite_free(pInode);
}
}
}
/*
** Given a file descriptor, locate the unixInodeInfo object that
** describes that file descriptor. Create a new one if necessary. The
** return value might be uninitialized if an error occurs.
**
** The mutex entered using the unixEnterMutex() function must be held
** when this function is called.
**
** Return an appropriate error code.
*/
static int findInodeInfo(
unixFile *pFile, /* Unix file with file desc used in the key */
unixInodeInfo **ppInode /* Return the unixInodeInfo object here */
){
int rc; /* System call return code */
int fd; /* The file descriptor for pFile */
struct unixFileId fileId; /* Lookup key for the unixInodeInfo */
struct stat statbuf; /* Low-level file information */
unixInodeInfo *pInode = 0; /* Candidate unixInodeInfo object */
/* Get low-level information about the file that we can used to
** create a unique name for the file.
*/
fd = pFile->h;
rc = fstat(fd, &statbuf);
if( rc!=0 ){
pFile->lastErrno = errno;
#ifdef EOVERFLOW
if( pFile->lastErrno==EOVERFLOW ) return UNQLITE_NOTIMPLEMENTED;
#endif
return UNQLITE_IOERR;
}
#ifdef __APPLE__
/* On OS X on an msdos filesystem, the inode number is reported
** incorrectly for zero-size files. See ticket #3260. To work
** around this problem (we consider it a bug in OS X, not SQLite)
** we always increase the file size to 1 by writing a single byte
** prior to accessing the inode number. The one byte written is
** an ASCII 'S' character which also happens to be the first byte
** in the header of every SQLite database. In this way, if there
** is a race condition such that another thread has already populated
** the first page of the database, no damage is done.
*/
if( statbuf.st_size==0 && (pFile->fsFlags & UNQLITE_FSFLAGS_IS_MSDOS)!=0 ){
rc = write(fd, "S", 1);
if( rc!=1 ){
pFile->lastErrno = errno;
return UNQLITE_IOERR;
}
rc = fstat(fd, &statbuf);
if( rc!=0 ){
pFile->lastErrno = errno;
return UNQLITE_IOERR;
}
}
#endif
SyZero(&fileId,sizeof(fileId));
fileId.dev = statbuf.st_dev;
fileId.ino = statbuf.st_ino;
pInode = inodeList;
while( pInode && SyMemcmp((const void *)&fileId,(const void *)&pInode->fileId, sizeof(fileId)) ){
pInode = pInode->pNext;
}
if( pInode==0 ){
pInode = (unixInodeInfo *)unqlite_malloc( sizeof(*pInode) );
if( pInode==0 ){
return UNQLITE_NOMEM;
}
SyZero(pInode,sizeof(*pInode));
SyMemcpy((const void *)&fileId,(void *)&pInode->fileId,sizeof(fileId));
pInode->nRef = 1;
pInode->pNext = inodeList;
pInode->pPrev = 0;
if( inodeList ) inodeList->pPrev = pInode;
inodeList = pInode;
}else{
pInode->nRef++;
}
*ppInode = pInode;
return UNQLITE_OK;
}
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to UNQLITE_OK unless an I/O error occurs during lock checking.
*/
static int unixCheckReservedLock(unqlite_file *id, int *pResOut){
int rc = UNQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
unixEnterMutex(); /* Because pFile->pInode is shared across threads */
/* Check if a thread in this process holds such a lock */
if( pFile->pInode->eFileLock>SHARED_LOCK ){
reserved = 1;
}
/* Otherwise see if some other process holds it.
*/
if( !reserved ){
struct flock lock;
lock.l_whence = SEEK_SET;
lock.l_start = RESERVED_BYTE;
lock.l_len = 1;
lock.l_type = F_WRLCK;
if (-1 == fcntl(pFile->h, F_GETLK, &lock)) {
int tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
pFile->lastErrno = tErrno;
} else if( lock.l_type!=F_UNLCK ){
reserved = 1;
}
}
unixLeaveMutex();
*pResOut = reserved;
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** This routine will only increase a lock. Use the unqliteOsUnlock()
** routine to lower a locking level.
*/
static int unixLock(unqlite_file *id, int eFileLock){
/* The following describes the implementation of the various locks and
** lock transitions in terms of the POSIX advisory shared and exclusive
** lock primitives (called read-locks and write-locks below, to avoid
** confusion with SQLite lock names). The algorithms are complicated
** slightly in order to be compatible with unixdows systems simultaneously
** accessing the same database file, in case that is ever required.
**
** Symbols defined in os.h indentify the 'pending byte' and the 'reserved
** byte', each single bytes at well known offsets, and the 'shared byte
** range', a range of 510 bytes at a well known offset.
**
** To obtain a SHARED lock, a read-lock is obtained on the 'pending
** byte'. If this is successful, a random byte from the 'shared byte
** range' is read-locked and the lock on the 'pending byte' released.
**
** A process may only obtain a RESERVED lock after it has a SHARED lock.
** A RESERVED lock is implemented by grabbing a write-lock on the
** 'reserved byte'.
**
** A process may only obtain a PENDING lock after it has obtained a
** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
** on the 'pending byte'. This ensures that no new SHARED locks can be
** obtained, but existing SHARED locks are allowed to persist. A process
** does not have to obtain a RESERVED lock on the way to a PENDING lock.
** This property is used by the algorithm for rolling back a journal file
** after a crash.
**
** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
** implemented by obtaining a write-lock on the entire 'shared byte
** range'. Since all other locks require a read-lock on one of the bytes
** within this range, this ensures that no other locks are held on the
** database.
**
** The reason a single byte cannot be used instead of the 'shared byte
** range' is that some versions of unixdows do not support read-locks. By
** locking a random byte from a range, concurrent SHARED locks may exist
** even if the locking primitive used is always a write-lock.
*/
int rc = UNQLITE_OK;
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode = pFile->pInode;
struct flock lock;
int s = 0;
int tErrno = 0;
/* If there is already a lock of this type or more restrictive on the
** unixFile, do nothing. Don't use the end_lock: exit path, as
** unixEnterMutex() hasn't been called yet.
*/
if( pFile->eFileLock>=eFileLock ){
return UNQLITE_OK;
}
/* This mutex is needed because pFile->pInode is shared across threads
*/
unixEnterMutex();
pInode = pFile->pInode;
/* If some thread using this PID has a lock via a different unixFile*
** handle that precludes the requested lock, return BUSY.
*/
if( (pFile->eFileLock!=pInode->eFileLock &&
(pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
){
rc = UNQLITE_BUSY;
goto end_lock;
}
/* If a SHARED lock is requested, and some thread using this PID already
** has a SHARED or RESERVED lock, then increment reference counts and
** return UNQLITE_OK.
*/
if( eFileLock==SHARED_LOCK &&
(pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
pFile->eFileLock = SHARED_LOCK;
pInode->nShared++;
pInode->nLock++;
goto end_lock;
}
/* A PENDING lock is needed before acquiring a SHARED lock and before
** acquiring an EXCLUSIVE lock. For the SHARED lock, the PENDING will
** be released.
*/
lock.l_len = 1L;
lock.l_whence = SEEK_SET;
if( eFileLock==SHARED_LOCK
|| (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
){
lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
lock.l_start = PENDING_BYTE;
s = fcntl(pFile->h, F_SETLK, &lock);
if( s==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_lock;
}
}
/* If control gets to this point, then actually go ahead and make
** operating system calls for the specified lock.
*/
if( eFileLock==SHARED_LOCK ){
/* Now get the read-lock */
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
if( (s = fcntl(pFile->h, F_SETLK, &lock))==(-1) ){
tErrno = errno;
}
/* Drop the temporary PENDING lock */
lock.l_start = PENDING_BYTE;
lock.l_len = 1L;
lock.l_type = F_UNLCK;
if( fcntl(pFile->h, F_SETLK, &lock)!=0 ){
if( s != -1 ){
/* This could happen with a network mount */
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_lock;
}
}
if( s==(-1) ){
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
}else{
pFile->eFileLock = SHARED_LOCK;
pInode->nLock++;
pInode->nShared = 1;
}
}else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
/* We are trying for an exclusive lock but another thread in this
** same process is still holding a shared lock. */
rc = UNQLITE_BUSY;
}else{
/* The request was for a RESERVED or EXCLUSIVE lock. It is
** assumed that there is a SHARED or greater lock on the file
** already.
*/
lock.l_type = F_WRLCK;
switch( eFileLock ){
case RESERVED_LOCK:
lock.l_start = RESERVED_BYTE;
break;
case EXCLUSIVE_LOCK:
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
break;
default:
/* Can't happen */
break;
}
s = fcntl(pFile->h, F_SETLK, &lock);
if( s==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
}
}
if( rc==UNQLITE_OK ){
pFile->eFileLock = eFileLock;
pInode->eFileLock = eFileLock;
}else if( eFileLock==EXCLUSIVE_LOCK ){
pFile->eFileLock = PENDING_LOCK;
pInode->eFileLock = PENDING_LOCK;
}
end_lock:
unixLeaveMutex();
return rc;
}
/*
** Add the file descriptor used by file handle pFile to the corresponding
** pUnused list.
*/
static void setPendingFd(unixFile *pFile){
unixInodeInfo *pInode = pFile->pInode;
UnixUnusedFd *p = pFile->pUnused;
p->pNext = pInode->pUnused;
pInode->pUnused = p;
pFile->h = -1;
pFile->pUnused = 0;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
**
** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
** the byte range is divided into 2 parts and the first part is unlocked then
** set to a read lock, then the other part is simply unlocked. This works
** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to
** remove the write lock on a region when a read lock is set.
*/
static int _posixUnlock(unqlite_file *id, int eFileLock, int handleNFSUnlock){
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode;
struct flock lock;
int rc = UNQLITE_OK;
int h;
int tErrno; /* Error code from system call errors */
if( pFile->eFileLock<=eFileLock ){
return UNQLITE_OK;
}
unixEnterMutex();
h = pFile->h;
pInode = pFile->pInode;
if( pFile->eFileLock>SHARED_LOCK ){
/* downgrading to a shared lock on NFS involves clearing the write lock
** before establishing the readlock - to avoid a race condition we downgrade
** the lock in 2 blocks, so that part of the range will be covered by a
** write lock until the rest is covered by a read lock:
** 1: [WWWWW]
** 2: [....W]
** 3: [RRRRW]
** 4: [RRRR.]
*/
if( eFileLock==SHARED_LOCK ){
if( handleNFSUnlock ){
off_t divSize = SHARED_SIZE - 1;
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = divSize;
if( fcntl(h, F_SETLK, &lock)==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
lock.l_type = F_RDLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = divSize;
if( fcntl(h, F_SETLK, &lock)==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST+divSize;
lock.l_len = SHARED_SIZE-divSize;
if( fcntl(h, F_SETLK, &lock)==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
}else{
lock.l_type = F_RDLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
if( fcntl(h, F_SETLK, &lock)==(-1) ){
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
}
}
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = PENDING_BYTE;
lock.l_len = 2L;
if( fcntl(h, F_SETLK, &lock)!=(-1) ){
pInode->eFileLock = SHARED_LOCK;
}else{
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
}
if( eFileLock==NO_LOCK ){
/* Decrement the shared lock counter. Release the lock using an
** OS call only when all threads in this same process have released
** the lock.
*/
pInode->nShared--;
if( pInode->nShared==0 ){
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = lock.l_len = 0L;
if( fcntl(h, F_SETLK, &lock)!=(-1) ){
pInode->eFileLock = NO_LOCK;
}else{
tErrno = errno;
rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
pInode->eFileLock = NO_LOCK;
pFile->eFileLock = NO_LOCK;
}
}
/* Decrement the count of locks against this same file. When the
** count reaches zero, close any other file descriptors whose close
** was deferred because of outstanding locks.
*/
pInode->nLock--;
if( pInode->nLock==0 ){
int rc2 = closePendingFds(pFile);
if( rc==UNQLITE_OK ){
rc = rc2;
}
}
}
end_unlock:
unixLeaveMutex();
if( rc==UNQLITE_OK ) pFile->eFileLock = eFileLock;
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int unixUnlock(unqlite_file *id, int eFileLock){
return _posixUnlock(id, eFileLock, 0);
}
/*
** This function performs the parts of the "close file" operation
** common to all locking schemes. It closes the directory and file
** handles, if they are valid, and sets all fields of the unixFile
** structure to 0.
**
*/
static int closeUnixFile(unqlite_file *id){
unixFile *pFile = (unixFile*)id;
if( pFile ){
if( pFile->dirfd>=0 ){
int err = close(pFile->dirfd);
if( err ){
pFile->lastErrno = errno;
return UNQLITE_IOERR;
}else{
pFile->dirfd=-1;
}
}
if( pFile->h>=0 ){
int err = close(pFile->h);
if( err ){
pFile->lastErrno = errno;
return UNQLITE_IOERR;
}
}
unqlite_free(pFile->pUnused);
SyZero(pFile,sizeof(unixFile));
}
return UNQLITE_OK;
}
/*
** Close a file.
*/
static int unixClose(unqlite_file *id){
int rc = UNQLITE_OK;
if( id ){
unixFile *pFile = (unixFile *)id;
unixUnlock(id, NO_LOCK);
unixEnterMutex();
if( pFile->pInode && pFile->pInode->nLock ){
/* If there are outstanding locks, do not actually close the file just
** yet because that would clear those locks. Instead, add the file
** descriptor to pInode->pUnused list. It will be automatically closed
** when the last lock is cleared.
*/
setPendingFd(pFile);
}
releaseInodeInfo(pFile);
rc = closeUnixFile(id);
unixLeaveMutex();
}
return rc;
}
/************** End of the posix advisory lock implementation *****************
******************************************************************************/
/*
**
** The next division contains implementations for all methods of the
** unqlite_file object other than the locking methods. The locking
** methods were defined in divisions above (one locking method per
** division). Those methods that are common to all locking modes
** are gather together into this division.
*/
/*
** Seek to the offset passed as the second argument, then read cnt
** bytes into pBuf. Return the number of bytes actually read.
**
** NB: If you define USE_PREAD or USE_PREAD64, then it might also
** be necessary to define _XOPEN_SOURCE to be 500. This varies from
** one system to another. Since SQLite does not define USE_PREAD
** any form by default, we will not attempt to define _XOPEN_SOURCE.
** See tickets #2741 and #2681.
**
** To avoid stomping the errno value on a failed read the lastErrno value
** is set before returning.
*/
static int seekAndRead(unixFile *id, unqlite_int64 offset, void *pBuf, int cnt){
int got;
#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
unqlite_int64 newOffset;
#endif
#if defined(USE_PREAD)
got = pread(id->h, pBuf, cnt, offset);
#elif defined(USE_PREAD64)
got = pread64(id->h, pBuf, cnt, offset);
#else
newOffset = lseek(id->h, offset, SEEK_SET);
if( newOffset!=offset ){
if( newOffset == -1 ){
((unixFile*)id)->lastErrno = errno;
}else{
((unixFile*)id)->lastErrno = 0;
}
return -1;
}
got = read(id->h, pBuf, cnt);
#endif
if( got<0 ){
((unixFile*)id)->lastErrno = errno;
}