lsn.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_S_H'>
00025 
00026  $Id: lsn.h,v 1.5 2010/12/08 17:37:34 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef LSN_H
00054 #define LSN_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 #include "w_base.h"
00061 
00062 /* FRJ: Major changes to lsn_t
00063  * Once the database runs long enough we will run out of
00064  * partition numbers (only 64k possible). 
00065  * Fortunately, this is a log, so lsn_t don't last forever. 
00066  * Eventually things become durable and the log partition 
00067  * file gets reclaimed (deleted). 
00068  * As long as the first partition is gone before 
00069  * the last one fills, we can simply wrap and change the 
00070  * sense of lsn_t comparisions.
00071  * as follows:
00072  *
00073    Suppose we use unsigned 8 bit partition numbers. If the current
00074    global lsn has pnum >= 128 (MSB set), any lsn we encounter with
00075    pnum < 128 (MSB clear) must be older:
00076                                                                                
00077    0        1        ...        126        127        128        129        ...         254        255
00078        
00079 
00080    On the other hand, if the current global lsn has pnum < 128
00081    (MSB clear), any lsn we encounter with pnum >= 128 (MSB set)
00082    must be *older* (ie, the pnum recently wrapped from 255 back to
00083    0):
00084 
00085    128        129        ...         254        255        0        1        ...        126        127
00086 
00087    The first situation is easy: regular comparisons provide the
00088    ordering we want; The second is trickier due to the wrapping of
00089    partition numbers. Fortunately, there's an easy way around the
00090    problem!  Since the MSB of the pnum is also the MSB of the
00091    64-bit value holding the lsn, if comparisons interpret the
00092    64-bit value as signed we get the proper ordering:
00093 
00094    -128        -127        ...        -2        -1        0        1        ...        126        127
00095 
00096    We assume there are enough lsn_t that less than half are in use at
00097    a time. So, we divide the universe of lsn's into four regions,
00098    marked by the most significant two bits of the file.
00099 
00100    00+01 - don't care
00101    01+10 - unsigned comparisons
00102    10+11 - unsigned comparisons
00103    11+00 - signed comparisons
00104 
00105    For now, though, we'll just assume overflow doesn't happen ;)
00106 */
00107 
00108 typedef w_base_t::int8_t sm_diskaddr_t;
00109 
00110 /**\addtogroup LSNS 
00111  *
00112  * \section LLR Locates Log Records 
00113  * A log sequence number generally points to a record in the log.
00114  * It consists of two parts:
00115  * - hi(), a.k.a., file(). This is a number that matches a log partition
00116  *                         file, e.g., "log.<file>"
00117  * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is
00118  *                        the first byte of a log record, or is the first
00119  *                        byte after the last log record in the file (where
00120  *                        the next log record could be written).
00121  *
00122  * \note Once the database runs long enough we will run out of
00123  * partition numbers (only 64k possible). 
00124  * Fortunately, this is a log, so lsn_t don't last forever. 
00125  * Eventually things become durable and the log partition 
00126  * file gets reclaimed (deleted). 
00127  * As long as the first partition is gone before 
00128  * the last one fills, we can simply wrap and change the 
00129  * sense of lsn_t comparisions.
00130  *
00131  * \subsection WORK Identifying Limit of Partial Rollback
00132  *
00133  * A savepoint (sm_save_point_t) is an lsn_t. It tells how far back
00134  * a transaction should undo its actions when ss_m::rollback_work is called.
00135  *
00136  * \section PTS Page Timestamps
00137  * Each page has an lsn_t that acts as a timestamp; it is the
00138  * lsn of the last log record that describes an update to the page.
00139  *
00140  * In recovery, when a page is read in, all log records with sequence
00141  * numbers less than the page lsn have been applied, and redo of these
00142  * log records is not necessary.
00143  *
00144  * The storage manager has other special cases: lsn_t(0,1) -- this is
00145  * in page.cpp, page_p::_format(), and indicates a freshly formatted page
00146  * with no further updates.
00147  * \subsection NPCD Nominal Page Corruption-Detection
00148  * Pages have two copies of their page lsn; one at the head and one at the
00149  * end of the page. Presumably if the two match, the page is uncorrupted,
00150  * though that is no guarantee.  Certainly if they do not match, 
00151  * something is wrong.
00152  *
00153  * \section BPFS Buffer Pool Frame Status 
00154  * The buffer pool's control blocks (bfcb_t) contain an lsn_t,
00155  * the "recovery lsn" or rec_lsn.  This is a timestamp that can
00156  * be compared with the page lsns to determine if the copy in the
00157  * buffer pool is up-to-date or not.
00158  *
00159  * A recovery lsn is a lower bound on the lsn of a log record
00160  * that updated the page in the frame. 
00161  * A clean page in the buffer pool has a rec_lsn of lsn_t::null.
00162  * Each time a page is fixed in EX mode, the buffer control block 
00163  * ensures that the rec_lsn is not lsn_t::null, thereby indicating that
00164  * this page is probably dirty and needs flushing or, possibly, 
00165  * is being flushed. 
00166  * The rec_lsn is set to the tail of the log at the time the fix is done; this
00167  * ensures that any log record written for an update to the page has at
00168  * least the rec_lsn sequence number. There might be several updates to
00169  * the page before the page is cleaned, so the rec_lsn is indeed a lower
00170  * bound, and that's all we can know about it.
00171  *
00172  * Special cases of log sequence numbers:
00173  * - null: not a valid lsn_t
00174  * - max:  soon to overflow
00175  * - lsn(0,1) : used when some pages are formatted.
00176  *
00177  */
00178 
00179 /*\bug GNATS 136: TODO make thread-safe for 32-bit platform */
00180 /**\brief Log Sequence Number. See \ref LSNS.
00181  *
00182  * \ingroup LSNS
00183  * \details
00184  *
00185  * A log sequence number points to a record in the log.
00186  * It consists of two parts:
00187  * - hi(), a.k.a., file(). This is a number that matches a log partition
00188  *                         file, e.g., "log.<file>"
00189  * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is
00190  *                        the first byte of a log record, or is the first
00191  *                        byte after the last log record in the file (where
00192  *                        the next log record could be written).
00193  *
00194  * All state is  stored in a single 64-bit value. 
00195  * This reading or setting is atomic on 
00196  * 64-bit platforms (though updates still need protection). 
00197  * \warning This is NOT atomic on 32-bit platforms.
00198  *
00199  * Because all state fits in 64 bits, 
00200  * there is a trade-off between maximum supported log partition size 
00201  * and number of partitions. Two reasonable choices are:
00202  *
00203  * - 16-bit partition numbers, up to 256TB per partition
00204  * - 32-bit partition numbers, up to 4GB per partition
00205  *
00206  * 48-bit offsets are larger, but (slightly) more expensive and likely 
00207  * to wrap sooner. 
00208  * 32-bit offsets are still pretty big, and the chance of wrapping 
00209  * is *much* smaller (though a production system could theoretically 
00210  * hit the limit, since the count persists as long as the database 
00211  * exists. 
00212  * For now we go with the 32-32 split.
00213  *
00214  * lsn_t no longer cares whether the disk can handle the full range
00215  * it supports. 
00216  * If you support 48-bit partition sizes and the disk can
00217  * only handle 32-bit offsets, the largest file will just happen to be
00218  * smaller than lsn_t technically supports.
00219  *
00220  * lsn_t does not cater to unaligned accesses. 
00221  * Log writes, in particular, 
00222  * are expected to be 8-byte aligned. 
00223  * The extra wasted bytes just aren't worth the performance hit of allowing
00224  * misalignment.
00225  *
00226  * \note Once the database runs long enough we will run out of
00227  * partition numbers (only 64k possible). 
00228  * Fortunately, this is a log, so lsn_t don't last forever. 
00229  * Eventually things become durable and the log partition 
00230  * file gets reclaimed (deleted). 
00231  * As long as the first partition is gone before 
00232  * the last one fills, we can simply wrap and change the 
00233  * sense of lsn_t comparisions.
00234  *
00235  */
00236 class lsn_t {
00237     enum { file_hwm  =    0xffff }; 
00238 public:
00239     enum { PARTITION_BITS=16 };
00240     enum { PARTITION_SHIFT=(64-PARTITION_BITS) };
00241 
00242     static w_base_t::uint8_t mask() {
00243         static w_base_t::uint8_t const ONE = 1;
00244         return (ONE << PARTITION_SHIFT)-1;
00245     }
00246     
00247     lsn_t() : _data(0) { }
00248 
00249     lsn_t(w_base_t::uint4_t f, sm_diskaddr_t r) : 
00250                 _data(from_file(f) | from_rba(r)) { }
00251 
00252     // copy operator
00253     lsn_t(const lsn_t & other) : _data(other._data) { }
00254 
00255     bool valid()             const { 
00256                                     // valid is essentially iff file != 0
00257 #if W_DEBUG_LEVEL > 1
00258                                     w_base_t::uint8_t  copy_of_data =  _data;
00259                                     w_base_t::uint8_t  m =  mask();
00260                                     bool first = copy_of_data > m;
00261                                     w_base_t::uint8_t  f = 
00262                                                     to_file(copy_of_data);
00263                                     bool second = (f != 0);
00264                                     w_assert2(first == second);
00265 #endif
00266                                    return (_data > mask()); 
00267                             }
00268 
00269     w_base_t::uint4_t hi()   const  { return file(); }
00270     w_base_t::uint4_t file() const { return to_file(_data); }
00271 
00272     sm_diskaddr_t     lo()   const  { return rba(); }
00273     sm_diskaddr_t     rba()  const { return to_rba(_data); }
00274 
00275     // WARNING: non-atomic read-modify-write operations!
00276     void copy_rba(const lsn_t &other) { 
00277                 _data = get_file(_data) | get_rba(other._data); }
00278     void set_rba(sm_diskaddr_t &other) { 
00279                 _data = get_file(_data) | get_rba(other); }
00280     
00281     // WARNING: non-atomic read-modify-write operations!
00282     lsn_t& advance(int amt) { _data += amt; return *this; }
00283 
00284     lsn_t &operator+=(long delta) { return advance(delta); }
00285     lsn_t operator+(long delta) const { return lsn_t(*this).advance(delta); }
00286 
00287     bool operator>(const lsn_t& l) const { return l < *this; }
00288     bool operator<(const lsn_t& l) const { return _data < l._data; }
00289     bool operator>=(const lsn_t& l) const { return !(*this < l); }
00290     bool operator<=(const lsn_t& l) const { return !(*this > l); }
00291     bool operator==(const lsn_t& l) const { return _data == l._data; }
00292     bool operator!=(const lsn_t& l) const { return !(*this == l); }
00293 
00294 
00295 /*
00296  * This is the SM's idea of on-disk and in-structure 
00297  * things that reflect the size of a "disk".  It
00298  * is different from fileoff_t because the two are
00299  * orthogonal.  A system may be constructed that
00300  * has big addresses, but is only running on 
00301  * a "small file" environment.
00302  */
00303     static const int sm_diskaddr_max;
00304 
00305     static const lsn_t null;
00306     static const lsn_t max;
00307     
00308 private:
00309     w_base_t::uint8_t        _data;
00310 
00311     static w_base_t::uint4_t to_file(w_base_t::uint8_t f) { 
00312                 return (w_base_t::uint4_t) (f >> PARTITION_SHIFT); }
00313 
00314     static w_base_t::uint8_t get_file(w_base_t::uint8_t data) { 
00315                 return data &~ mask(); }
00316 
00317     static w_base_t::uint8_t from_file(w_base_t::uint4_t data) { 
00318                 return ((w_base_t::uint8_t) data) << PARTITION_SHIFT; }
00319     
00320     static sm_diskaddr_t to_rba(w_base_t::uint8_t r) { 
00321                 return (sm_diskaddr_t) (r & mask()); }
00322 
00323     static w_base_t::uint8_t get_rba(w_base_t::uint8_t data) { 
00324                 return to_rba(data); }
00325 
00326     static w_base_t::uint8_t from_rba(sm_diskaddr_t data) { 
00327                 return to_rba(data); }
00328 };
00329 #endif

Generated on Mon Jan 2 15:13:56 2012 for Shore Storage Manager by  doxygen 1.4.7