00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_S_H'> 00025 00026 $Id: lsn.h,v 1.5 2010/12/08 17:37:34 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef LSN_H 00054 #define LSN_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 #include "w_base.h" 00061 00062 /* FRJ: Major changes to lsn_t 00063 * Once the database runs long enough we will run out of 00064 * partition numbers (only 64k possible). 00065 * Fortunately, this is a log, so lsn_t don't last forever. 00066 * Eventually things become durable and the log partition 00067 * file gets reclaimed (deleted). 00068 * As long as the first partition is gone before 00069 * the last one fills, we can simply wrap and change the 00070 * sense of lsn_t comparisions. 00071 * as follows: 00072 * 00073 Suppose we use unsigned 8 bit partition numbers. If the current 00074 global lsn has pnum >= 128 (MSB set), any lsn we encounter with 00075 pnum < 128 (MSB clear) must be older: 00076 00077 0 1 ... 126 127 128 129 ... 254 255 00078 00079 00080 On the other hand, if the current global lsn has pnum < 128 00081 (MSB clear), any lsn we encounter with pnum >= 128 (MSB set) 00082 must be *older* (ie, the pnum recently wrapped from 255 back to 00083 0): 00084 00085 128 129 ... 254 255 0 1 ... 126 127 00086 00087 The first situation is easy: regular comparisons provide the 00088 ordering we want; The second is trickier due to the wrapping of 00089 partition numbers. Fortunately, there's an easy way around the 00090 problem! Since the MSB of the pnum is also the MSB of the 00091 64-bit value holding the lsn, if comparisons interpret the 00092 64-bit value as signed we get the proper ordering: 00093 00094 -128 -127 ... -2 -1 0 1 ... 126 127 00095 00096 We assume there are enough lsn_t that less than half are in use at 00097 a time. So, we divide the universe of lsn's into four regions, 00098 marked by the most significant two bits of the file. 00099 00100 00+01 - don't care 00101 01+10 - unsigned comparisons 00102 10+11 - unsigned comparisons 00103 11+00 - signed comparisons 00104 00105 For now, though, we'll just assume overflow doesn't happen ;) 00106 */ 00107 00108 typedef w_base_t::int8_t sm_diskaddr_t; 00109 00110 /**\addtogroup LSNS 00111 * 00112 * \section LLR Locates Log Records 00113 * A log sequence number generally points to a record in the log. 00114 * It consists of two parts: 00115 * - hi(), a.k.a., file(). This is a number that matches a log partition 00116 * file, e.g., "log.<file>" 00117 * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is 00118 * the first byte of a log record, or is the first 00119 * byte after the last log record in the file (where 00120 * the next log record could be written). 00121 * 00122 * \note Once the database runs long enough we will run out of 00123 * partition numbers (only 64k possible). 00124 * Fortunately, this is a log, so lsn_t don't last forever. 00125 * Eventually things become durable and the log partition 00126 * file gets reclaimed (deleted). 00127 * As long as the first partition is gone before 00128 * the last one fills, we can simply wrap and change the 00129 * sense of lsn_t comparisions. 00130 * 00131 * \subsection WORK Identifying Limit of Partial Rollback 00132 * 00133 * A savepoint (sm_save_point_t) is an lsn_t. It tells how far back 00134 * a transaction should undo its actions when ss_m::rollback_work is called. 00135 * 00136 * \section PTS Page Timestamps 00137 * Each page has an lsn_t that acts as a timestamp; it is the 00138 * lsn of the last log record that describes an update to the page. 00139 * 00140 * In recovery, when a page is read in, all log records with sequence 00141 * numbers less than the page lsn have been applied, and redo of these 00142 * log records is not necessary. 00143 * 00144 * The storage manager has other special cases: lsn_t(0,1) -- this is 00145 * in page.cpp, page_p::_format(), and indicates a freshly formatted page 00146 * with no further updates. 00147 * \subsection NPCD Nominal Page Corruption-Detection 00148 * Pages have two copies of their page lsn; one at the head and one at the 00149 * end of the page. Presumably if the two match, the page is uncorrupted, 00150 * though that is no guarantee. Certainly if they do not match, 00151 * something is wrong. 00152 * 00153 * \section BPFS Buffer Pool Frame Status 00154 * The buffer pool's control blocks (bfcb_t) contain an lsn_t, 00155 * the "recovery lsn" or rec_lsn. This is a timestamp that can 00156 * be compared with the page lsns to determine if the copy in the 00157 * buffer pool is up-to-date or not. 00158 * 00159 * A recovery lsn is a lower bound on the lsn of a log record 00160 * that updated the page in the frame. 00161 * A clean page in the buffer pool has a rec_lsn of lsn_t::null. 00162 * Each time a page is fixed in EX mode, the buffer control block 00163 * ensures that the rec_lsn is not lsn_t::null, thereby indicating that 00164 * this page is probably dirty and needs flushing or, possibly, 00165 * is being flushed. 00166 * The rec_lsn is set to the tail of the log at the time the fix is done; this 00167 * ensures that any log record written for an update to the page has at 00168 * least the rec_lsn sequence number. There might be several updates to 00169 * the page before the page is cleaned, so the rec_lsn is indeed a lower 00170 * bound, and that's all we can know about it. 00171 * 00172 * Special cases of log sequence numbers: 00173 * - null: not a valid lsn_t 00174 * - max: soon to overflow 00175 * - lsn(0,1) : used when some pages are formatted. 00176 * 00177 */ 00178 00179 /*\bug GNATS 136: TODO make thread-safe for 32-bit platform */ 00180 /**\brief Log Sequence Number. See \ref LSNS. 00181 * 00182 * \ingroup LSNS 00183 * \details 00184 * 00185 * A log sequence number points to a record in the log. 00186 * It consists of two parts: 00187 * - hi(), a.k.a., file(). This is a number that matches a log partition 00188 * file, e.g., "log.<file>" 00189 * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is 00190 * the first byte of a log record, or is the first 00191 * byte after the last log record in the file (where 00192 * the next log record could be written). 00193 * 00194 * All state is stored in a single 64-bit value. 00195 * This reading or setting is atomic on 00196 * 64-bit platforms (though updates still need protection). 00197 * \warning This is NOT atomic on 32-bit platforms. 00198 * 00199 * Because all state fits in 64 bits, 00200 * there is a trade-off between maximum supported log partition size 00201 * and number of partitions. Two reasonable choices are: 00202 * 00203 * - 16-bit partition numbers, up to 256TB per partition 00204 * - 32-bit partition numbers, up to 4GB per partition 00205 * 00206 * 48-bit offsets are larger, but (slightly) more expensive and likely 00207 * to wrap sooner. 00208 * 32-bit offsets are still pretty big, and the chance of wrapping 00209 * is *much* smaller (though a production system could theoretically 00210 * hit the limit, since the count persists as long as the database 00211 * exists. 00212 * For now we go with the 32-32 split. 00213 * 00214 * lsn_t no longer cares whether the disk can handle the full range 00215 * it supports. 00216 * If you support 48-bit partition sizes and the disk can 00217 * only handle 32-bit offsets, the largest file will just happen to be 00218 * smaller than lsn_t technically supports. 00219 * 00220 * lsn_t does not cater to unaligned accesses. 00221 * Log writes, in particular, 00222 * are expected to be 8-byte aligned. 00223 * The extra wasted bytes just aren't worth the performance hit of allowing 00224 * misalignment. 00225 * 00226 * \note Once the database runs long enough we will run out of 00227 * partition numbers (only 64k possible). 00228 * Fortunately, this is a log, so lsn_t don't last forever. 00229 * Eventually things become durable and the log partition 00230 * file gets reclaimed (deleted). 00231 * As long as the first partition is gone before 00232 * the last one fills, we can simply wrap and change the 00233 * sense of lsn_t comparisions. 00234 * 00235 */ 00236 class lsn_t { 00237 enum { file_hwm = 0xffff }; 00238 public: 00239 enum { PARTITION_BITS=16 }; 00240 enum { PARTITION_SHIFT=(64-PARTITION_BITS) }; 00241 00242 static w_base_t::uint8_t mask() { 00243 static w_base_t::uint8_t const ONE = 1; 00244 return (ONE << PARTITION_SHIFT)-1; 00245 } 00246 00247 lsn_t() : _data(0) { } 00248 00249 lsn_t(w_base_t::uint4_t f, sm_diskaddr_t r) : 00250 _data(from_file(f) | from_rba(r)) { } 00251 00252 // copy operator 00253 lsn_t(const lsn_t & other) : _data(other._data) { } 00254 00255 bool valid() const { 00256 // valid is essentially iff file != 0 00257 #if W_DEBUG_LEVEL > 1 00258 w_base_t::uint8_t copy_of_data = _data; 00259 w_base_t::uint8_t m = mask(); 00260 bool first = copy_of_data > m; 00261 w_base_t::uint8_t f = 00262 to_file(copy_of_data); 00263 bool second = (f != 0); 00264 w_assert2(first == second); 00265 #endif 00266 return (_data > mask()); 00267 } 00268 00269 w_base_t::uint4_t hi() const { return file(); } 00270 w_base_t::uint4_t file() const { return to_file(_data); } 00271 00272 sm_diskaddr_t lo() const { return rba(); } 00273 sm_diskaddr_t rba() const { return to_rba(_data); } 00274 00275 // WARNING: non-atomic read-modify-write operations! 00276 void copy_rba(const lsn_t &other) { 00277 _data = get_file(_data) | get_rba(other._data); } 00278 void set_rba(sm_diskaddr_t &other) { 00279 _data = get_file(_data) | get_rba(other); } 00280 00281 // WARNING: non-atomic read-modify-write operations! 00282 lsn_t& advance(int amt) { _data += amt; return *this; } 00283 00284 lsn_t &operator+=(long delta) { return advance(delta); } 00285 lsn_t operator+(long delta) const { return lsn_t(*this).advance(delta); } 00286 00287 bool operator>(const lsn_t& l) const { return l < *this; } 00288 bool operator<(const lsn_t& l) const { return _data < l._data; } 00289 bool operator>=(const lsn_t& l) const { return !(*this < l); } 00290 bool operator<=(const lsn_t& l) const { return !(*this > l); } 00291 bool operator==(const lsn_t& l) const { return _data == l._data; } 00292 bool operator!=(const lsn_t& l) const { return !(*this == l); } 00293 00294 00295 /* 00296 * This is the SM's idea of on-disk and in-structure 00297 * things that reflect the size of a "disk". It 00298 * is different from fileoff_t because the two are 00299 * orthogonal. A system may be constructed that 00300 * has big addresses, but is only running on 00301 * a "small file" environment. 00302 */ 00303 static const int sm_diskaddr_max; 00304 00305 static const lsn_t null; 00306 static const lsn_t max; 00307 00308 private: 00309 w_base_t::uint8_t _data; 00310 00311 static w_base_t::uint4_t to_file(w_base_t::uint8_t f) { 00312 return (w_base_t::uint4_t) (f >> PARTITION_SHIFT); } 00313 00314 static w_base_t::uint8_t get_file(w_base_t::uint8_t data) { 00315 return data &~ mask(); } 00316 00317 static w_base_t::uint8_t from_file(w_base_t::uint4_t data) { 00318 return ((w_base_t::uint8_t) data) << PARTITION_SHIFT; } 00319 00320 static sm_diskaddr_t to_rba(w_base_t::uint8_t r) { 00321 return (sm_diskaddr_t) (r & mask()); } 00322 00323 static w_base_t::uint8_t get_rba(w_base_t::uint8_t data) { 00324 return to_rba(data); } 00325 00326 static w_base_t::uint8_t from_rba(sm_diskaddr_t data) { 00327 return to_rba(data); } 00328 }; 00329 #endif