sm.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_H'>
00025 
00026  $Id: sm.h,v 1.324 2012/01/02 17:02:17 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef SM_H
00054 #define SM_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 /*
00061  *  Stuff needed by value-added servers.  NOT meant to be included by
00062  *  internal SM .c files, except to the extent that they need these
00063  *  definitions used in the API.
00064  */
00065 
00066 #ifdef __GNUG__
00067 #pragma interface
00068 #endif
00069 
00070 #ifndef SM_INT_4_H
00071 #include <sm_int_4.h>
00072 #endif
00073 
00074 #ifndef SM_DU_STATS_H
00075 #include <sm_du_stats.h> // declares sm_du_stats_t
00076 #endif
00077 
00078 #ifndef SM_STATS_H
00079 #include <smstats.h> // declares sm_stats_info_t and sm_config_info_t
00080 #endif
00081 
00082 #ifndef SM_S_H
00083 #include <sm_s.h> // declares key_type_s, rid_t, lsn_t
00084 #endif
00085 
00086 #ifndef LEXIFY_H
00087 #include <lexify.h> // declares sortorder with constants
00088 #endif
00089 
00090 #ifndef NBOX_H
00091 #include <nbox.h>   // key_info_t contains nbox_t
00092 #endif /* NBOX_H */
00093 
00094 #ifndef SORT_S_H
00095 #include <sort_s.h> // declares key_info_t
00096 #endif
00097 
00098 /* DOXYGEN Documentation : */
00099 
00100 /**\addtogroup LOGSPACE 
00101  *
00102  * Updates performed by transactions are logged so that
00103  * they can be rolled back (in the event of a transaction abort)
00104  * or restored (in the event of a crash).  Both the old and new values
00105  * of an updated location are logged.  This allows a steal, no-force
00106  * buffer management policy, which means the buffer manager is free
00107  * to write dirty pages to disk at any time and yet does not have
00108  * to write dirty pages for a transaction to commit.
00109  *
00110  * The log is stored in a set of Unix files, all in the same directory,
00111  * whose path is determined by a run-time option.
00112  * The maximum size of the log is also determined by a run-time option.
00113  * The proper value of the log size depends on
00114  * the expected transaction mix.  More specifically, it depends on the
00115  * age of the oldest (longest running) transaction in the system and
00116  * the amount of log space used by all active transactions. Here are
00117  * some general rules to determine the  amount  of  free  log  space
00118  * available in the system.
00119  * - Log records between the first log
00120  *   record generated by the oldest active transaction and the most
00121  *   recent log record generated by any transaction cannot be thrown
00122  *   away.
00123  * - Log records from a transaction are no longer needed
00124  *   once the transaction has committed or completely aborted and all
00125  *   updates have made it to disk. Aborting a transaction causes log space
00126  *   to be used, so space is reserved for aborting each transaction.
00127  *   Enough log space must be available to commit or abort all active
00128  *   transactions at all times.
00129  * 
00130  * - Only space starting at the beginning of the log can be reused.  
00131  *   This space can be reused if it contains log records only for 
00132  *   transactions meeting the previous rule.
00133  *
00134  * -  All storage manager calls that update records require log space twice
00135  *    the size of the space updated in the record. All calls that create,
00136  *    append, or truncate records require log space equal to the size
00137  *    created, inserted, or deleted. Log records generated by these calls
00138  *    (generally one per call) have an overhead of approximately 50 bytes.
00139  *
00140  * - The amount of log space reserved for aborting a transaction is equal to 
00141  *   the amount of log space generated by the transaction plus a fudge 
00142  *   factor. 
00143  *   (Where btrees are concerned, a structure modification
00144  *   might be necessary on abort, using more space on abort, or might not be
00145  *   necessary on abort where it was done during forward processing, 
00146  *   using less space on abort.)
00147  *
00148  * - The transaction assumes responsibility for reserving space in the
00149  *   log so that it can abort, should it need to (without leaving an
00150  *   unrecoverable volume).  The transaction and the log cooperate to
00151  *   reserve space for the transaction's aborting.
00152  *
00153  * - When insufficient log space is available for a transaction, the 
00154  *   transaction is (may be, depending on the server) aborted.
00155  *   The storage manager will return an error indication (out of log space)
00156  *   if it is unable to insert a log record into the log due to
00157  *   insufficient space.
00158  *
00159  * Checkpoints are taken periodically by the storage manager in order to 
00160  * free log space and shorten recovery time.  Checkpoints are "fuzzy" 
00161  * and do not require the system to pause while they are completing.
00162  *
00163  * See the storage manager constructor ss_m::ss_m for more information
00164  * about handling out-of-logspace conditions.
00165  *
00166  */
00167 
00168 /**\addtogroup SSMOPT
00169  *
00170  * These are the run-time options for the storage manager.
00171  *
00172  * -sm_bufpoolsize : 
00173  *      - type: number
00174  *      - description: This is the size of 
00175  *      the buffer pool in Kb.  Must be large enough to hold at least 32 pages,
00176  *      so it depends on the configured page size.
00177  *      - default: none
00178  *      - required?: yes
00179  *
00180  * -sm_hugetlbfs_path
00181  *      - type: string (full absolute path name)
00182  *      - description: Needed only if you configured --with-hugetlbfs.
00183  *      - default: see \ref CONFIGOPT
00184  *      - required?: no
00185  *
00186  * -sm_reformat_log
00187  *      - type: Boolean
00188  *      - description: If "yes", your log will be clobbered and the storage
00189  *      manager will start up with an entirely new log.
00190  *      - default: no
00191  *      - required?: no
00192  *
00193  * -sm_logdir
00194  *      - type: string (relative or absolute path name)
00195  *      - description: Location of the log files.
00196  *      - default: none
00197  *      - required?: yes
00198  *
00199  * -sm_logbufsize
00200  *      - type: number
00201  *      - description: size of log buffer in KB.
00202  *      Must be greater than or equal to the larger of
00203  *      (4 times the page size, 64 Kb)
00204  *      and less than or equal to
00205  *      128 times the page_size. This is the size of 
00206  *      the log buffer in Kb.
00207  *      - default: 128
00208  *      - required?: no
00209  *
00210  * -sm_logsize
00211  *      - type: number
00212  *      - description: greater than or equal to 8256 
00213  *      This is the maximum size of the log in Kb.  It is a function of
00214  *      the log buffer size, and  the default is the minimum allowable for
00215  *      the default sm_logbufsize.
00216  *      - default: 128
00217  *      - required?: yes
00218  *
00219  * -sm_log_warn
00220  *      - type: number between 0 and 100 (percentage)
00221  *      - description: percentage of log that, when consumed by active
00222  *      transactions, triggers a callback warning of potential inability
00223  *      to roll back.   Should be less than 50.
00224  *      - default: 45
00225  *      - required?: no
00226  *
00227  * -sm_errlog
00228  *      - type: string (relative or absolute path name OR - )
00229  *      - description: Destination for error messages.  If "-" is given,
00230  *      the destination is stderr.
00231  *      - default: \b -
00232  *      - required?: no
00233  *
00234  * -sm_errlog_level
00235  *      - type: string  (one of none|emerg|fatal|internal|error|warning|info|debug)
00236  *      - description: filter.  Messages of this priority or higher are issued to
00237  *      the error log; messages with lower priority are not issued.
00238  *      The priorities are listed from high to low. "none" means no logging
00239  *      will happen.
00240  *      - default: error
00241  *      - required?: no
00242  *
00243  * -sm_locktablesize : 
00244  *      - type: number greater than or equal to 64
00245  *      - description: size of lock manager's hash table will be a prime
00246  *      number near and greater than the given number.
00247  *      - default: 64000 (yields a hash table with 65521 buckets)
00248  *      - required?: no
00249  *
00250  * -sm_lock_escalate_to_page_threshold
00251  *      - type: number greater than or equal to 0
00252  *      - description: after acquiring this many record locks on a page, the lock
00253  *      will be escalated to a page lock. A value of 0 disables escalation to a
00254  *      page lock.
00255  *      - default: 5
00256  *      - required?: no
00257  *
00258  * -sm_lock_escalate_to_store_threshold
00259  *      - type: number greater than or equal to 0
00260  *      - description: after acquiring this many page locks in a store, 
00261  *      the lock will be escalated to a store lock. 
00262  *      A value of 0 disables escalation to a store lock.
00263  *      - default: 25
00264  *      - required?: no
00265  *      
00266  * -sm_lock_escalate_to_volume_threshold
00267  *      - type: number greater than or equal to 0
00268  *      - description: after acquiring this many store locks in a volume, 
00269  *      the lock will be escalated to a volume lock. 
00270  *      A value of 0 disables escalation to a volume lock.
00271  *      - default: 0
00272  *      - required?: no
00273  *
00274  * -sm_cc_alg
00275  *      - type: string (one of file | page | record | none)
00276  *      - description: default locking granularity for file operations.
00277  *      This can be overridden on a per-transaction basis with
00278  *      ss_m::set_xct_lock_level().
00279  *      - default: record
00280  *      - required?: no
00281  *
00282  * -sm_backgroundflush
00283  *      - type: Boolean
00284  *      - description: Enables background-flushing of volumes.
00285  *      Must be set to "yes" for sm_num_page_writers to have any effect.
00286  *      - default: yes
00287  *      - required?: no
00288  *
00289  * -sm_num_page_writers
00290  *      - type: number
00291  *      - description: greater than or equal to 0; this is the number of
00292  *      background-flushing threads for each volume. If you have 
00293  *      lots of threads, 
00294  *      a huge buffer pool, and few volumes, you should increase this.
00295  *      If sm_backgroundflush is "no", this value is ignored.
00296  *      - default: 2
00297  *      - required?: no
00298  *
00299  * -sm_prefetch
00300  *      - type: Boolean
00301  *      - description: Enables prefetching for scans.
00302  *      - default: no
00303  *      - required?: no
00304  *
00305  * -sm_logging
00306  *      - type: Boolean
00307  *      - description: Allows you to turn off logging for a run of
00308  *      the storage manager. This is only for experimentation, to
00309  *      measure logging overhead in a limited way.
00310  *      Aborts, rollbacks and restart/recovery 
00311  *      do not work without logging.   Independent concurrent
00312  *      transactions using btrees might not work without logging (this is
00313  *      not well-tested).
00314  *      Each time you start the server, you had better start with a
00315  *      clean device or a device that resulted from a clean shutdown
00316  *      of the prior run.
00317  *      - default: yes
00318  *      - required?: no
00319  *
00320  * -sm_lock_caching
00321  *      - type: Boolean
00322  *      - description: Enables caching of transaction locks in transaction.
00323  *      Can be turned off for experimentation. If no, the default is not
00324  *      to cache locks, but any transaction can turn on caching for itself
00325  *      by calling the ss_m method  set_lock_cache_enable(bool enable).
00326  *      - default: yes
00327  *      - required?: no
00328  *
00329  */
00330 
00331 
00332 /**\addtogroup SSMXCT 
00333  * All storage manager operations on data must be done within the scope of
00334  * a transaction (ss_m::begin_xct, ss_m::commit_xct, ss_m::abort_xct,
00335  * ss_m::chain_xct). 
00336  *
00337  * A very few storage manager operations, such as formatting a volume, are
00338  * called outside the scope of a transaction and the storage manager begins
00339  * its own transaction to do the work.
00340  *
00341  * Operations that fail return an error indication and the storage 
00342  * manager assumes that the server will thereafter abort the 
00343  * transaction in which the error occurred, when abort is indicated.
00344  * Abort is indicated when eUSERABORT or eDEADLOCK is returned and 
00345  * when the server chooses to abort rather than to work around the problem 
00346  * (whatever it might be, such as eRETRY).
00347  *
00348  * The storage manager does not enforce the aborting of any erroneous
00349  * transactions except, possibly, those that are in danger of 
00350  * running out of log space.
00351  * (This is done with the destructor of the prologue used on each call
00352  * to the storage manager, see next paragraph).
00353  *
00354  * It is always the server's responsibility to abort.
00355  * When the storage manager 
00356  * encounters a eLOGSPACEWARN condition (the log hasn't enough
00357  * space \e at \e this \e moment to abort the running transaction,
00358  * assuming a 1:1 ratio of rollback-logging overhead to forward-processing
00359  * logging overhead), it does one of two things:
00360  * - passes the error code eLOGSPACEWARN up the call stack back to the server
00361  *   if the storage manager was constructed with no log-space-warning callback
00362  *   argument (see LOG_WARN_CALLBACK_FUNC, ss_m::ss_m).
00363  * - tries to abort a transaction before passing an error code back up
00364  *   the call stack to the server. Choosing a victim transaction to abort
00365  *   is done by the server in its log-space-warning callback function (passed
00366  *   in on ss_m::ss_m, q.v.).
00367  *   Only if that callback function returns a non-null victim transaction
00368  *   and returns eUSERABORT does the storage manager abort that victim
00369  *   before returning eUSERABORT up the call stack. Any other
00370  *   error code returned by the callback function is just returned up
00371  *   the call stack.
00372  *
00373  * \section LOCKS Locks 
00374  *
00375  * The storage manager automatically acquires the 
00376  * necessary locks when the data are read or written.
00377  * The locks thus acquired are normally released at the end of a transaction,
00378  * thus, by default, transactions are two-phase and well-formed (degree 3).
00379  *
00380  * \subsection GRAN Lock Granularity
00381  * The fine-grained locks are normally used for records in files, but
00382  * provision is made for using coarser-grained locks.  The transaction
00383  * has a default lock level associated with it,
00384  * which governs the granularity of locks acquired by the storage manager
00385  * on behalf of the transaction.
00386  * The lock manager provides for lock escalation to coarser locks to
00387  * reduce the locking costs.  See \ref SSMLOCK and smlevel_0::concurrency_t. 
00388  *
00389  * Key-value locking is normally used for B+-Trees. (See \ref MOH1.)
00390  * R*-Trees normally use coarse-granularity locking.
00391  * The locking protocol used with an index is determined when the
00392  * index is created.  A transaction may acquire coarse (index-level)
00393  * locks with explicit calls to the lock manager, but by default, 
00394  * the granularity/level/protocol associated with the index is used.
00395  * See smlevel_0::concurrency_t. 
00396  *
00397  * \section DISTXCT Distributed Transactions
00398  * Storage manager transactions may be used as "threads" (to 
00399  * overload this term) of distributed transactions.  
00400  * Coordination of 2-phase commit must be done externally,
00401  * but the storage manager supports preparing the (local) transaction "thread" 
00402  * for two-phase commit, and it will log the necessary 
00403  * data for recovering in-doubt transactions.
00404  *
00405  * \section ATTACH Threads and Transactions
00406  * Transactions are not tied to storage manager threads (smthread_t, not
00407  * to be confused with a local "thread" of a distributed transaction) in any 
00408  * way other than that a transaction must be \e attached to a
00409  * thread while any storage manager work is being done on behalf of 
00410  * that transaction.   This is how the storage manager knows \e which
00411  * transaction is to acquire the locks and latches, etc.
00412  * But a thread can attach and detach from transactions at will, so
00413  * work may be performed by different threads each time the storage
00414  * manager is called on behalf of a given transaction; this allows the
00415  * server to keep a pool of threads to perform work and allows them to
00416  * perform work on behalf of any active transaction.
00417  *
00418  * \warning
00419  * While there are limited circumstances in which multiple threads can be
00420  * attached to the same transaction \e concurrently and perform storage 
00421  * manager operations on behalf of that transaction concurrently,
00422  * which is a hold-over from the original storage manager, this 
00423  * functionality may be deprecated soon.  The reason for this 
00424  * is that it is extremely difficult to handle errors internally
00425  * when multiple threads are attached to a transaction because 
00426  * partial rollback is impossible in the absence of multiple log streams
00427  * for a transaction.
00428  *
00429  * Under no circumstances may a thread attach to more than one transaction
00430  * at a time.
00431  *
00432  *
00433  * \section EXOTICA Exotica
00434  * The storage manager also provides 
00435  * - partial rollback (ss_m::save_work and ss_m::rollback_work), 
00436  *   which undoes actions but does not release locks,
00437  * - transaction chaining (ss_m::chain_xct), which commits, but retains locks
00438  *   and gives them to a new transaction,
00439  * - lock release (sm_quark_t, ss_m::unlock), allowing less-than-3-degree
00440  *   transactions.
00441  *
00442  *  To reduce the cost (particularly in logging) of loading databases,
00443  *  the storage manager provides for unlogged loading of stores.
00444  *  See \ref SSMSTORE.
00445  */
00446 
00447 
00448 
00449 /** \file sm_vas.h
00450  * \details
00451  * This is the include file that all value-added servers should
00452  * include to get the Shore Storage Manager API.
00453  *
00454  */
00455 /********************************************************************/
00456 
00457 class page_p;
00458 class xct_t;
00459 class device_m;
00460 class vec_t;
00461 class log_m;
00462 class lock_m;
00463 class btree_m;
00464 class file_m;
00465 class pool_m;
00466 class dir_m;
00467 class chkpt_m;
00468 class lid_m; 
00469 class sm_stats_cache_t;
00470 class option_group_t;
00471 class option_t;
00472 class prologue_rc_t;
00473 class rtree_m;
00474 class sort_stream_i;
00475 
00476 /**\addtogroup SSMSP  
00477  * A transaction may perform a partial rollback using savepoints.
00478  * The transaction populates a savepoint by calling ss_m::save_work,
00479  * then it may roll back to that point with ss_m::rollback_work.
00480  * Locks acquired between the save_work and rollback_work are \e not
00481  * released.
00482  */
00483 
00484 /**\brief A point to which a transaction can roll back.
00485  * \ingroup SSMSP
00486  *\details
00487  * A transaction can do partial rollbacks with
00488  * save_work  and rollback_work, which use this class to determine
00489  * how far to roll back.
00490  * It is a log sequence number (the lsn_t base) for the work done
00491  * to the point when save_work is called, plus a tid_t (presumably the id of the transaction that took the savepoint -- confirm against ss_m::save_work).
00492  */
00493 class sm_save_point_t : public lsn_t {
00494 public:
00495     NORET            sm_save_point_t(): _tid(0,0) {}; // _tid starts out as tid_t(0,0)
00496     friend ostream& operator<<(ostream& o, const sm_save_point_t& p) { // prints the tid, a ':' and then the lsn_t base
00497         return o << p._tid << ':' << (const lsn_t&) p;
00498     }
00499     friend istream& operator>>(istream& i, sm_save_point_t& p) { // reads back the form written by operator<<
00500         char ch; // receives the separator character; its value is not checked
00501         return i >> p._tid >> ch >> (lsn_t&) p;
00502     }
00503     tid_t            tid() const { return _tid; } // returns the stored tid_t by value
00504 private:
00505     friend class ss_m; // lets ss_m access _tid directly
00506     tid_t            _tid;
00507 };
00508 
00509 /**\addtogroup SSMQK  
00510  * A quark is a marker in the transaction's list of acquired locks.
00511  * One may release all short-duration locks acquired since the quark was inserted 
00512  * into the list via sm_quark_t::open().
00513  * The lock manager modifies the locks acquired inside a quark
00514  * so that non-extent locks are no longer than short-duration.
00515  *
00516  * This is for experimentation only, and is \e not well-tested or supported.
00517  *
00518  * How used:
00519  * \code
00520  * sm_quark_t *q = new sm_quark_t;
00521  * q->open();  // inserts marker in transaction's list.
00522  * ...
00523  * q->close(); // frees short-duration locks to the marker.
00524  * delete q;
00525  * \endcode
00526  *
00527  * Deleting the quark without closing it causes it to be closed.
00528  * Quarks may \e not be used with multi-threaded transactions.
00529  *
00530  * Note that if a transaction has multiple threads attached when
00531  * a thread opens a quark, there is no way to determine where the
00532  * quark takes effect, and since it affects the locks acquired by
00533  * all threads of the transaction, it must be used very carefully
00534  * where multiply-threaded transactions are concerned.
00535  */
00536 
00537 /**\brief List of locks acquired by a transaction since
00538  * the quark was "opened".   
00539  * \ingroup SSMQK
00540  * \details
00541  * When a quark is closed (by calling close()), 
00542  * the \a release parameter indicates if all short-duration read
00543  * locks acquired during the quark should be released.
00544  * \note Quarks are an experimental feature for use 
00545  * as a building block for a more general nested-transaction facility.
00546  *
00547  * \internal See lock_x.h
00548  */
00549 class sm_quark_t {
00550 public:
00551     NORET            sm_quark_t() {}
00552     NORET            ~sm_quark_t(); // per the SSMQK notes, deleting an unclosed quark causes it to be closed
00553 
00554     rc_t            open();
00555     rc_t            close(bool release=true); // release defaults to true
00556 
00557     tid_t            tid()const { return _tid; } // returns the stored tid_t by value
00558     operator         bool()const { return (_tid != tid_t::null); } // true iff _tid differs from tid_t::null
00559     friend ostream& operator<<(ostream& o, const sm_quark_t& q);
00560     friend istream& operator>>(istream& i, sm_quark_t& q);
00561 
00562 private:
00563     friend class ss_m; // lets ss_m access _tid directly
00564     tid_t            _tid;
00565 
00566     // disable copying: copy constructor and assignment are private and have no body here
00567     sm_quark_t(const sm_quark_t&);
00568     sm_quark_t& operator=(const sm_quark_t&);
00569 
00570 };
00571 
00572 class sm_store_info_t;
00573 class log_entry;
00574 class coordinator;
00575 class tape_t;
00576 /**\brief \b This \b is \b the \b SHORE \b Storage \b Manager \b API.
00577  *\details
00578  * Most of the API for using the storage manager is through this
00579  * interface class.
00580  */
00581 class ss_m : public smlevel_top 
00582 {
00583     friend class pin_i;
00584     friend class sort_stream_i;
00585     friend class prologue_rc_t;
00586     friend class log_entry;
00587     friend class coordinator;
00588     friend class tape_t;
00589 public:
00590 
00591     typedef smlevel_0::LOG_WARN_CALLBACK_FUNC LOG_WARN_CALLBACK_FUNC;
00592     typedef smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC LOG_ARCHIVED_CALLBACK_FUNC;
00593     typedef smlevel_0::ndx_t ndx_t;
00594     typedef smlevel_0::concurrency_t concurrency_t;
00595     typedef smlevel_1::xct_state_t xct_state_t;
00596 
00597     typedef sm_store_property_t store_property_t;
00598 
00599 #ifdef COMMENT
00600     //
00601     // Below is most of the interface for the SHORE Storage Manager.
00602     // The rest is located in pin.h, scan.h, and smthread.h
00603     //
00604 
00605     //
00606     // TEMPORARY FILES/INDEXES
00607     //
00608     // When a file or index is created there is a tmp_flag parameter
00609     // that when true indicates that the file is temporary.
00610     // Operations on a temporary file are not logged and the
00611     // file will be gone the next time the volume is mounted.
00612     //
00613     // TODO: IMPLEMENTATION NOTE on Temporary Files/Indexes:
00614     //        Temp files cannot be trusted after transaction abort.
00615     //            They should be marked for removal.
00616     //
00617     // CODE STRUCTURE:
00618     //    Almost all ss_m functions begin by creating a prologue object
00619     //    whose constructor and descructor check for many common errors.
00620     //    In addition most ss_m::OP() functions now call an ss_m::_OP()
00621     //    function to do the real work.  The ss_m::OP functions should
00622     //    not be called by other ss_m functions, instead the corresponding
00623     //    ss_m::_OP function should be used.
00624     //
00625 
00626 #endif /* COMMENT */
00627 
00628   public:
00629     /**\brief Add storage manager options to the given options group.
00630      *\ingroup SSMINIT
00631      *\details
00632      * @param[in] grp The caller's option group, to which the
00633      * storage manager's options will be added for processing soon.
00634      *
00635      * Before the ss_m constructor can be called, setup_options
00636      * \b must be called.  This will install the storage manager's options and
00637      * initialize any that are not required.
00638      * Once all required options have been set, an ss_m can be constructed.
00639      *
00640      *\note This is not thread-safe.  The application (server) must prevent
00641      * concurrent calls to setup_options.
00642      */
00643     static rc_t setup_options(option_group_t* grp);
00644 
00645     /**\brief  Initialize the storage manager.
00646      * \ingroup SSMINIT
00647      * \details
00648      * @param[in] warn   A callback function. This is called 
00649      * when/if the log is in danger of becoming "too full".
00650      * @param[in] get   A callback function. This is called 
00651      * when the storage manager needs an archived log file to be restored.
00652      *
00653      * When an ss_m object is created, the storage manager initializes itself
00654      * and,
00655      * if the sthreads package has not already been initialized by virtue
00656      * of an sthread_t running, the sthreads package is initialized now.
00657      *
00658      * The log is read and recovery is performed (\ref MHLPS), 
00659      * and control returns to
00660      * the caller, after which time
00661      * storage manager threads (instances of smthread_t) may be constructed and
00662      * storage manager may be used.
00663      *
00664      * The storage manager is used by invoking its static methods.  
00665      * You may use them as follows:
00666      * \code
00667      * ss_m *UNIQ = new ss_m();
00668      *
00669      * W_DO(UNIQ->mount_dev(...))
00670      *     // or
00671      * W_DO(ss_m::mount_dev(...))
00672      * \endcode
00673      *
00674      *
00675      * Only one ss_m object may be extant at any time. If you try
00676      * to create another while the one exists, a fatal error will occur
00677      * (your program will choke with a message about your mistake).
00678      *
00679      * The callback argument given to the storage manager constructor
00680      * is called when the storage manager determines that it is in danger
00681      * of running out of log space.  Heuristics are used to guess when
00682      * this is the case.  
00683      *
00684      * If the function \a warn archives and removes log files, the function
00685      * \a get must be provided to restore those log files when the
00686      * storage manager needs them.
00687      *
00688      * For details and examples, see  \ref smlevel_0::LOG_WARN_CALLBACK_FUNC, 
00689      *  \ref smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC, and 
00690      *  \ref LOGSPACE.
00691      */
00692     ss_m(LOG_WARN_CALLBACK_FUNC warn=NULL, LOG_ARCHIVED_CALLBACK_FUNC get=NULL);
00693 
00694     /**\brief  Shut down the storage manager.
00695      * \ingroup SSMINIT
00696      * \details
00697      * When the storage manager object is deleted, it shuts down.
00698      * Thereafter it is not usable until another ss_m object is 
00699      * constructed.
00700      */
00701     ~ss_m();
00702 
00703     /**\brief Cause the storage manager's shutting down to be done cleanly 
00704      * or to simulate a crash.
00705      * \ingroup SSMINIT
00706      * \details
00707      * @param[in] clean   True means shut down gracefully, false means simulate a crash.
00708      *
00709      * When the storage manager's destructor is called
00710      * the buffer pool is flushed to disk, unless this method is called 
00711      * with \a clean == \e false.
00712      *
00713      * \note If this method is used, it
00714      * must be called after the storage manager is 
00715      * constructed if it is to take effect. Each time the storage
00716      * manager is constructed, the state associated with this is set
00717      * to \e true, i.e., "shut down properly".
00718      *
00719      * \note This method is not thread-safe, only one thread should use this
00720      * at any time, presumably just before shutting down.
00721      */
00722     static void         set_shutdown_flag(bool clean);
00723 
00724     /**\brief Notify storage manager when a log file was archived by a
00725      * LOG_WARN_CALLBACK_FUNC.
00726      *
00727      * The arguments:
00728      * @param[in] logfile   Character string name of file archived.
00729      */
00730     static rc_t         log_file_was_archived(const char * logfile);
00731 
00732 private:
00733     void                _construct_once(LOG_WARN_CALLBACK_FUNC x=NULL,
00734                                            LOG_ARCHIVED_CALLBACK_FUNC y=NULL);
00735     void                _destruct_once();
00736 
00737 
00738 public:
00739     /**\addtogroup SSMXCT
00740      *
00741      * All work performed on behalf of a transaction must occur while that
00742      * transaction is "attached" to the thread that performs the work.
00743      * Creating a transaction attaches it to the thread that creates the transaction. 
00744      * The thread may detach from the transaction and attach to another.
00745      * Multiple threads may attach to a single transaction and do work in certain circumstances.   See \ref SSMMULTIXCT
00746      *
00747      * 
00748      */
00749     /**\brief Begin a transaction 
00750      *\ingroup SSMXCT
00751      * @param[in] timeout   Optional, controls blocking behavior.
00752      * \details
00753      *
00754      * Start a new transaction and "attach" it to this thread. 
00755      * No running transaction may be attached to this thread.
00756      * 
00757      * Storage manager methods that must block (e.g., to acquire a lock) 
00758      * will use the timeout given.  
00759      * The default timeout is the one associated with this thread.
00760      *
00761      * \sa timeout_in_ms
00762      */
00763     static rc_t           begin_xct(
00764         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00765 
00766     /**\brief Begin an instrumented transaction. 
00767      *\ingroup SSMXCT
00768      * @param[in] stats   Pointer to an allocated statistics-holding structure.
00769      * @param[in] timeout   Optional, controls blocking behavior.
00770      * \details
00771      * No running transaction may be already attached to this thread.
00772      * A new transaction is started and attached to the running thread.
00773      *
00774      * The transaction will be instrumented.
00775      * This structure is updated by the storage manager whenever a thread
00776      * detaches from this transaction.  The activity recorded during
00777      * the time the thread is attached to the transaction will be stored in
00778      * the per-transaction statistics.
00779      * \attention It is the client's 
00780      * responsibility to delete the statistics-holding structure.
00781      * 
00782      * Storage manager methods that must block (e.g., to acquire a lock) 
00783      * will use the timeout given.  
00784      * The default timeout is the one associated with this thread.
00785      *
00786      * \sa timeout_in_ms
00787      */
00788     static rc_t           begin_xct(
00789         sm_stats_info_t*         stats,  // allocated by caller
00790         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00791 
00792     /**\brief Begin a transaction and return the transaction id.
00793      *\ingroup SSMXCT
00794      * @param[out] tid      Transaction id of new transaction.
00795      * @param[in] timeout   Optional, controls blocking behavior.
00796      * \details
00797      *
00798      * No running transaction may be attached to this thread.
00799      * 
00800      * Storage manager methods that must block (e.g., to acquire a lock) 
00801      * will use the timeout given.  
00802      * The default timeout is the one associated with this thread.
00803      *
00804      * \sa timeout_in_ms
00805      */
00806     static rc_t           begin_xct(
00807         tid_t&                   tid,
00808         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00809 
00810     /**\addtogroup SSM2PC  
00811      * The storage manager contains support for externally-coordinated
00812      * transactions that use
00813      * two-phase-commit with presumed abort.
00814      * The server must provide the coordination and the coordinator is
00815      * assumed to have its own stable storage, and it is assumed to recover
00816      * from failures in a "short time", the precise meaning of which is given below.
00817      * A prepared transaction, like an active transaction,
00818      * consumes log space and holds locks.
00819      * Even if a prepared transaction does not hold locks needed by 
00820      * other transactions, it consumes resources in a way that can interfere 
00821      * with other transactions.
00822      * If a prepared transaction remains in the system for a long time 
00823      * while other transactions are running, eventually the storage 
00824      * manager needs the log space used (reserved) by the prepared transaction.
00825      * A coordinator must resolve its prepared transactions
00826      * before the storage manager effectively runs out of 
00827      * log space for other transactions in the system.
00828      * The amount of time involved is a function of the size of the log
00829      * and of the demands of the other transactions in the system.
00830      *
00831      * For the purpose of this discussion, the portion of a global 
00832      * transaction that involves a single Shore Storage Manager transaction is 
00833      * called a thread of the global transaction.
00834      *
00835      * A Shore transaction participates as a thread of a global transaction
00836      * as follows:
00837      - Start a storage-manager transaction with ss_m::begin_xct.
00838      - Acquire a global transaction identifier from the coordinator.
00839      - Indicate to the storage manager that this transaction is a 
00840      thread of a global transaction, and associate the global transaction 
00841      identifier with this thread by calling ss_m::enter_2pc.
00842      - Associate a coordinator with the transaction for recovery 
00843      purposes, by calling ss_m::set_coordinator.
00844      - Prepare the thread of the transaction and get the storage manager's 
00845      vote with ss_m::prepare_xct.  
00846      It is an error to commit a global transaction thread without first 
00847      preparing it.  It is an error to do anything else 
00848      in a transaction after it is prepared, except to end 
00849      the transaction or retry the prepare (to get the vote again).
00850      - Convey the vote to the coordinator, and determine the transaction's 
00851      fate from the coordinator.
00852      - End the thread with ss_m::commit_xct or ss_m::abort_xct.
00853      *
00854      * The storage manager 
00855      * logs the minimal information required to effect a vote of the
00856      * transaction threads that are storage manager transactions,
00857      * and to recover such in-doubt transactions after restart.
00858      * Thus, after a crash/restart, the server may query the storage manager
00859      * about in-doubt (prepared) transactions with ss_m::query_prepared_xct,
00860      * which tells the caller the number and global transaction IDs associated
00861      * with prepared transactions.
00862      * Using this, the server contacts the coordinator and resumes the
00863      * voting.
00864      * The server may find the local transaction IDs and use ss_m::tid_to_xct
00865      * to attach these transactions  and to resolve them.
00866      * 
00867      * Commit and abort of read-only transactions are the same,
00868      * as these transactions have no log entries.  Preparing read-only transactions
00869      * causes them to commit/abort and the vote returned is vote_readonly.
00870      * Once this vote is communicated to the coordinator and the coordinator
00871      * records it on stable storage, there is no need to involve this thread in
00872      * any further processing.  For this reason,
00873      * read-only transactions do not appear as prepared transactions at
00874      * recovery time.
00875      * 
00876      */
00877 
00878     /**\brief Make the attached transaction a thread of a distributed transaction.
00879      *\ingroup SSM2PC
00880      *
00881      * @param[in] gtid    Global transaction ID to associate with this transaction.  This will be logged when the transaction is prepared.
00882      * 
00883      * \note This can be called at most once for a given transaction.
00884      * The transaction must be attached to the calling thread.
00885      * No other threads may be attached to the transaction.
00886      */
00887     static rc_t           enter_2pc(const gtid_t &gtid); 
00888     /**\brief Assign a coordinator handle to this distributed transaction.
00889      *\ingroup SSM2PC
00890      * @param[in] h      Handle of the coordinator.  Not interpreted by
00891      * the storage manager.
00892      *
00893      * The storage manager associates this server handle with the transaction 
00894      * so that when the transaction is prepared, this information is 
00895      * written to the log. Upon recovery, if this transaction is still in doubt,
00896      * the value-added server can query the 
00897      * storage manager for in-doubt transactions, get their server handles,
00898      * and resolve the transactions.
00899      * See query_prepared_xct and recover_2pc.
00900      */
00901     static rc_t           set_coordinator(const server_handle_t &h); 
00902 
00903     /**\brief Prepare a thread of a distributed transaction.
00904      *\ingroup SSM2PC
00905      * @param[out] stats    Pointer to an allocated statistics-holding 
00906      *                      structure.
00907      * @param[out] vote     This thread's vote.
00908      *
00909      * The storage manager will prepare the attached transaction (a thread
00910      * of a distributed transaction) for commit.
00911      * If this transaction has performed no logged updates, the 
00912      * vote returned will be vote_readonly.
00913      * If this transaction can commit, the vote returned will be vote_commit.
00914      * If an error occurs during the prepare, the vote will be vote_abort.
00915      *
00916      * If the transaction is being instrumented, the 
00917      * statistics-holding structure will be returned to the caller, 
00918      * and the caller is responsible for its deallocation.
00919      */
00920     static rc_t           prepare_xct(
00921                             sm_stats_info_t*&         stats, 
00922                             vote_t&                   vote); 
00923 
00924     /**\brief Prepare a thread of a distributed transaction.
00925      *\ingroup SSM2PC
00926      * @param[out] vote     This thread's vote. See \ref w_base_t::vote_t.
00927      *
00928      * The storage manager will prepare the attached transaction (a thread
00929      * of a distributed transaction) for commit.
00930      * If this transaction has performed no logged updates, the 
00931      * vote returned will be vote_readonly.
00932      * If this transaction can commit, the vote returned will be vote_commit.
00933      * If an error occurs during the prepare, the vote will be vote_abort.
00934      */
00935     static rc_t           prepare_xct(vote_t &vote); 
00936 
00937     /**\brief Force the transaction to vote "read-only" in a two-phase commit. 
00938      *\ingroup SSM2PC
00939      * \details
00940      * This will override the storage manager's determination of 
00941      * whether this thread of a distributed transaction is read-only, which is
00942      * based on whether the local transaction thread logged anything. This
00943      * method may be useful if the local transaction rolled back to 
00944      * a savepoint.
00945      * See  \ref w_base_t::vote_t.
00946      */
00947     static rc_t           force_vote_readonly(); 
00948 
00949     /**\brief Given a global transaction id, find the local prepared 
00950      * transaction associated with it. 
00951      *\ingroup SSM2PC
00952      * @param[in] gtid     A global transaction ID (an opaque quantity 
00953      * to the storage manager).
00954      * @param[in] mayblock Not used.
00955      * @param[out] local   Return the transaction ID of the prepared 
00956      * SM transaction.
00957      * \details
00958      * Searches the transaction list for a prepared transaction with the given
00959      * global transaction id. If found, it returns a reference to the 
00960      * local transaction.  The transaction is attached to the running
00961      * thread before it is returned.
00962      */
00963     static rc_t           recover_2pc(const gtid_t & gtid,
00964         bool                      mayblock,
00965         tid_t &                   local
00966         );
00967 
00968     /**\brief  Return the number of prepared transactions.
00969      *\ingroup SSM2PC
00970      * @param[out] numtids   The number of in-doubt transactions.
00971      * \details
00972      * Used by a server at start-up, after recovery, to find out if
00973      * there are any in-doubt transactions.  If so, the server must
00974      * use the second form of query_prepared_xct to find the global
00975      * transaction IDs of these in-doubt transactions.
00976      */
00977     static rc_t           query_prepared_xct(int &numtids);
00978 
00979     /**\brief  Return the global transaction IDs of in-doubt transactions. 
00980      *\ingroup SSM2PC
00981      * @param[in] numtids   The number of global transaction ids in the list.
00982      * @param[out] l   The caller-provided list into which to write the 
00983      * global transaction-ids.
00984      * \details
00985      * Used by a server at start-up, after recovery, to find out the
00986      * global transaction IDs of the prepared transactions.  The storage
00987      * manager fills in the first numtids entries of the pre-allocated list.
00988      * The server may have first called the first form of query_prepared_xct
00989      * to find out how many such transactions there are after recovery.
00990      *
00991      * \attention Read-only transactions 
00992      * do not appear as in-doubt transactions. Because they did not
00993      * generate any log records, they will not be "discovered" by analysis.
00994      * The server must determine that any thread of a global transaction that
00995      * does not appear to be in doubt was a read-only thread or
00996      * it never prepared and thus has been aborted.
00997      * Read-only transactions that were prepared would have voted read-only,
00998      * and if the coordinator recorded that vote on stable storage, it
00999      * should not be concerned with these transaction threads any further.
01000      * If the coordinator does not have this information recorded, the
01001      * transaction thread could have been an aborted non-read-only transaction,
01002      * so the coordinator must, in this case, presume that the thread aborted
01003      * and thus make the global transaction abort.
01004      */
01005     static rc_t           query_prepared_xct(int numtids, gtid_t l[]);
01006 
01007 
01008     /**\brief Commit a transaction.
01009      *\ingroup SSMXCT
01010      * @param[in] lazy   Optional, controls flushing of log.
01011      * @param[out] plastlsn   If non-null, this is a pointer to a
01012      *                    log sequence number into which the storage
01013      *                    manager writes the LSN of the last log record
01014      *                    inserted for this transaction.
01015      * \details
01016      *
01017      * Commit the attached transaction and detach it, destroy it.
01018      * If \a lazy is true, the log is not synced.  This means that
01019      * recovery of this transaction might not be possible.
01020      */
01021     static rc_t           commit_xct(
01022                                      bool   lazy = false,
01023                                      lsn_t* plastlsn=NULL);
01024 
01025     /**\brief Commit an instrumented transaction and get its statistics.
01026      *\ingroup SSMXCT
01027      * @param[out] stats   Get a copy of the statistics for this transaction.
01028      * @param[in] lazy   Optional, controls flushing of log.
01029      * @param[out] plastlsn   If non-null, this is a pointer to a
01030      *                    log sequence number into which the storage
01031      *                    manager writes the LSN of the last log record
01032      *                    inserted for this transaction.
01033      * \details
01034      *
01035      * Commit the attached transaction and detach it, destroy it.
01036      * If \a lazy is true, the log is not synced.  This means that
01037      * recovery of this transaction might not be possible.
01038      */
01039     static rc_t            commit_xct(
01040                                     sm_stats_info_t*& stats, 
01041                                     bool              lazy = false,
01042                                     lsn_t*            plastlsn=NULL);
01043 
01044     /**\brief Commit an instrumented transaction and start a new one.
01045      *\ingroup SSMXCT
01046      * @param[out] stats   Get a copy of the statistics for the first transaction.
01047      * @param[in] lazy   Optional, controls flushing of log.
01048      * \details
01049      *
01050      * Commit the attached transaction and detach it, destroy it.
01051      * Start a new transaction and attach it to this thread.
01052      * \note \e The \e new 
01053      * \e transaction \e inherits \e the \e locks \e of \e the \e old 
01054      * \e transaction.
01055      *
01056      * If \a lazy is true, the log is not synced.  This means that
01057      * recovery of the committed transaction might not be possible.
01058      */
01059     static rc_t            chain_xct(
01060         sm_stats_info_t*&         stats,    /* in w/new, out w/old */
01061         bool                      lazy = false);  
01062 
01063     /**\brief Commit a transaction and start a new one, inheriting locks.
01064      *\ingroup SSMXCT
01065      * @param[in] lazy   Optional, controls flushing of log.
01066      * \details
01067      *
01068      * Commit the attached transaction and detach it, destroy it.
01069      * Start a new transaction and attach it to this thread.
01070      * \note \e The \e new 
01071      * \e transaction \e inherits \e the \e locks \e of \e the \e old 
01072      * \e transaction.
01073      *
01074      * If \a lazy is true, the log is not synced.  This means that
01075      * recovery of the committed transaction might not be possible.
01076      */
01077     static rc_t            chain_xct(bool lazy = false);  
01078 
01079 
01080     /**\brief Commit a group of transactions.
01081      *\ingroup SSMXCT
01082      * @param[in] list      List of pointers to transactions to commit.
01083      * @param[in] listlen   Number of transactions in the list.
01084      * \details
01085      *
01086      * Commit each transaction in the list as an all-or-none affair.
01087      * Any transaction that is attached to the thread will be
01088      * detached before anything is done.
01089      *
01090      * The purpose of this method is to allow multiple transactions 
01091      * to commit together with a single log record. No voting takes place.
01092      * The entire list of transaction identifiers must fit in a single
01093      * log record. If it does not, a descriptive error will be returned and no 
01094      * transaction will be committed. In this case, the server has the
01095      * option to singly commit each transaction.
01096      *
01097      * If any other error occurs during one of the commits, the error
01098      * will be returned to the caller and none of the transactions
01099      * will be committed; they \b must be aborted thereafter.
01100      *
01101      * This is not intended to be used with transactions that are
01102      * participating in two-phase commit, but if
01103      * one of the transactions is participating in two-phase commit,
01104      * they all must be and they all must be prepared.  
01105      *
01106      * Chaining and lazy commit are not offered with this form of commit.
01107      * If a transaction in the list is instrumented, its statistics
01108      * resources will be deleted upon successful commit.
01109      *
01110      * \note 
01111      * By taking a list of transaction pointers, this avoids the tid_to_xct lookup 
01112      * for each transaction, but the server must regard the transaction pointers as
01113      * invalid after this method returns.
01114      * The transactions, once committed, do not exist anymore. 
01115      * If an error is returned, the server has to re-verify the transaction pointers 
01116      * by using ss_m::tid_to_xct from a separate list of transaction ids to determine
01117      * which transactions are extant.
01118      */
01119     static rc_t            commit_xct_group(
01120         xct_t *               list[],
01121         int                   listlen);
01122 
01123     /**\brief Abort an instrumented transaction and get its statistics.
01124      *\ingroup SSMXCT
01125      * @param[out] stats   Get a copy of the statistics for this transaction.
01126      * \details
01127      *
01128      * Abort the attached transaction and detach it, destroy it.
01129      */
01130     static rc_t            abort_xct(sm_stats_info_t*&  stats);
01131     /**\brief Abort a transaction.
01132      *\ingroup SSMXCT
01133      * \details
01134      *
01135      * Abort the attached transaction and detach it, destroy it.
01136      */
01137     static rc_t            abort_xct();
01138 
01139     /**\brief Populate a save point.
01140      *\ingroup SSMSP
01141      * @param[out] sp   An sm_save_point_t owned by the caller.
01142      *\details
01143      * Store in sp the needed information to be able to roll back 
01144      * to this point. 
01145      * For use with rollback_work.
01146      * \note Only one thread may be attached to a transaction when this
01147      * is called.
01148      */
01149     static rc_t            save_work(sm_save_point_t& sp);
01150 
01151     /**\brief Roll back to a savepoint.
01152      *\ingroup SSMSP
01153      * @param[in] sp   An sm_save_point_t owned by the caller and
01154      * populated by save_work.
01155      *\details
01156      * Undo everything that was 
01157      * done from the time save_work was called on this savepoint.
01158      * \note Locks are not freed.
01159      *
01160      * \note Only one thread may be attached to a transaction when this
01161      * is called.
01162      */
01163     static rc_t            rollback_work(const sm_save_point_t& sp);
01164 
01165     /**\brief Return the number of transactions in active state.
01166      *\ingroup SSMXCT
01167      * \details
01168      * While this is thread-safe, the moment a value is returned, it could
01169      * be out of date.
01170      * Useful only for debugging.
01171      */
01172     static w_base_t::uint4_t     num_active_xcts();
01173 
01174     /**\brief Attach the given transaction to the currently-running smthread_t.
01175      *\ingroup SSMXCT
01176      * \details
01177      * It is assumed that the currently running thread is an smthread_t.
01178      */
01179     static void           attach_xct(xct_t *x) { me()->attach_xct(x); } // thin inline wrapper: delegates to the thread object returned by me()
01180 
01181     /**\addtogroup SSMMULTIXCT 
01182      * 
01183      * Certain operations may be performed while more than one
01184      * thread is attached to a transaction (this functionality might be
01185      * removed in a future release).
01186      * Any number of attached threads may be read-only.
01187      * The kinds of updates that can be made by multiple threads are limited by
01188      * the need to avoid latch-mutex and latch-latch deadlocks. 
01189      *
01190      * There are several reasons for this.
01191      * 1) The multiple threads are not protected from each other by locks.
01192      * 2) Interleaving of top-level actions is not supported with rollback;
01193      * this means that for the duration of a top-level action, a thread needs
01194      * access to the log that excludes all other threads in 
01195      * the same transaction.
01196      *
01197      * The internal logging protocol is this:
01198      * T1: latch page, log update. Logging requires acquiring a mutex
01199      * on the xct's log buffer.
01200      * T2: performing any top-level action, acquires the mutex on the
01201      * xct's log buffer before doing the action (latching the page).
01202      *
01203      * Thus, anything involving top-level actions is suspect.  B-trees
01204      * use top-level actions, as does file-page allocation, and creation/
01205      * destruction of stores (files, indexes).  Thus, just about
01206      * any kind of concurrent updates on the same page
01207      * in the same transaction is problematic, and just about any update
01208      * can result in latching extent-map or store-map pages.
01209      * This activity could be disallowed by enforcing a strict 
01210      * rule that at most  one update operation can be going on 
01211      * in a transaction at any time, however this is too restrictive.
01212      *
01213      * Multiple updating threads can
01214      * work \b if \b the \b data \b are \b partitioned by volume.
01215      * So a well-behaved server may use multiple-threaded transactions
01216      * to do updates as long as the updates are on different \b volumes.
01217      * It might also allow read-only transaction threads to be
01218      * concurrent with a single updating thread.
01219      *
01220      * Savepoints and partial rollback may \e not be used with 
01221      * multi-threaded transactions. This is not enforced by the storage
01222      * manager; it is poor behavior on the part of a server.
01223      * For example, the behavior of the following is undefined:
01224      * - thread 1: attach, read,      read,   read, ...
01225      * - thread 2: attach, save work, update, rollback
01226      * If the two threads are reading and possibly updating the same 
01227      * data, the results are timing-dependent and could produce a latch-
01228      * latch or latch-mutex deadlock.
01229      *
01230      * Ongoing research at DIAS is investigating ways to extend the usefulness
01231      * of parallelism within a transaction (multi-threaded transactions).
01232      * Current thoughts about this are for servers to coordinate multiple 
01233      * transactions using two-phase commit or an optimized version
01234      * of commit and abort for groups of local transactions.
01235      */
01236 
01237     /**\brief Detach any attached transaction from the currently-running smthread_t.
01238      *\ingroup SSMXCT
01239      * \details
01240      * Sever the connection between the running thread and the transaction.
01241      * This allows the running thread to attach a different 
01242      * transaction and to perform work on its behalf.
01243      */
01244     static void           detach_xct() { xct_t *x = me()->xct(); // transaction reported by me(), if any
01245                                         if(x) me()->detach_xct(x); } // nothing is done when me()->xct() is null
01246 
01247     /**\brief Get the transaction structure for a given transaction id.
01248      *\ingroup SSMXCT
01249      * @param[in] tid   Transaction ID.
01250      *\details
01251      * Return a pointer to the storage manager's transaction structure.
01252      * Can be used with detach_xct and attach_xct.
01253      */
01254     static xct_t*          tid_to_xct(const tid_t& tid);
01255     /**\brief Get the transaction ID for a given transaction structure.
01256      *\ingroup SSMXCT
01257      * @param[in] x   Pointer to transaction structure.
01258      *\details
01259      * Return the transaction ID for the given transaction.
01260      */
01261     static tid_t           xct_to_tid(const xct_t* x);
01262 
01263     /**\brief Print transaction information to an output stream.
01264      *\ingroup SSMAPIDEBUG
01265      * @param[in] o   Stream to which to write the information.
01266      * \details
01267      * This is for debugging only, and is not thread-safe. 
01268      */
01269     static rc_t            dump_xcts(ostream &o);
01270 
01271     /**\brief Get the transaction state for a given transaction (structure).
01272      *\ingroup SSMXCT
01273      * @param[in] x   Pointer to transaction structure.
01274      * \details
01275      * Returns the state of the transaction (active, prepared). It is
01276      * hard to get the state of an aborted or committed transaction, since
01277      * their structures no longer exist.
01278      */
01279     static xct_state_t     state_xct(const xct_t* x);
01280 
01281     /**\brief Return the amount of log this transaction would consume
01282      * if it rolled back.
01283      *\ingroup SSMXCT
01284      *
01285      * If a transaction aborts with eOUTOFLOGSPACE this function can
01286      * be used in conjunction with xct_reserve_log_space to
01287      * pre-allocate the needed amount of log space before retrying.
01288      */
01289     static smlevel_0::fileoff_t        xct_log_space_needed();
01290 
01291     /**\brief Require the specified amount of log space to be
01292      * available for this transaction before continuing.
01293      *\ingroup SSMXCT
01294      *
01295      * If a transaction risks running out of log space it can
01296      * pre-request some or all of the needed amount before starting in
01297      * order to improve its chances of success. Other new transactions
01298      * will be unable to acquire log space before this request is
01299      * granted (existing ones will be able to commit, unless they also
01300      * run out of space, because that tends to free up log space and
01301      * avoids wasting work).
01302      */
01303     static rc_t            xct_reserve_log_space(fileoff_t amt);
01304     
01305     /**\brief Get the locking granularity for the attached transaction.
01306      * \ingroup SSMLOCK
01307      */
01308     static concurrency_t   xct_lock_level();
01309     /**\brief Set the default locking level for the attached transaction.
01310      * \ingroup SSMLOCK
01311      * \details
01312      * @param[in] l  The level to use for the balance of this transaction.
01313      * Legitimate values are t_cc_record,  t_cc_page,  t_cc_file.
01314      *
01315      * \note Only one thread may be attached to the transaction when this
01316      * is called. If more than one thread is attached, a fatal error
01317      * will ensue.
01318      */
01319     static void            set_xct_lock_level(concurrency_t l);
01320 
01321     /**\brief Collect transaction information in a virtual table.
01322      * \ingroup SSMVTABLE
01323      * \details
01324      * @param[out] v  The virtual table to populate.
01325      * @param[in] names_too  If true, make the 
01326      *            first row of the table a list of the attribute names.
01327      *
01328      * All attribute values will be strings.
01329      * The virtual table v can be printed with its output operator
01330      * operator<< for ostreams.
01331      *
01332      * \attention Not atomic. Can yield stale data. 
01333      */
01334     static rc_t            xct_collect(vtable_t&v, bool names_too=true);
01335 
01336     /**\brief Collect buffer pool information in a virtual table.
01337      * \ingroup SSMVTABLE
01338      * \details
01339      * @param[out] v  The virtual table to populate.
01340      * @param[in] names_too  If true, make the 
01341      *            first row of the table a list of the attribute names.
01342      *
01343      * \attention Be wary of using this with a large buffer pool.
01344      *
01345      * All attribute values will be strings.
01346      * The virtual table v can be printed with its output operator
01347      * operator<< for ostreams.
01348      *
01349      * \attention Not atomic. Can yield stale data. 
01350      */
01351     static rc_t            bp_collect(vtable_t&v, bool names_too=true);
01352 
01353     /**\brief Collect lock table information in a virtual table.
01354      * \ingroup SSMVTABLE
01355      * \details
01356      * @param[out] v  The virtual table to populate.
01357      * @param[in] names_too  If true, make the 
01358      *            first row of the table a list of the attribute names.
01359      *
01360      * All attribute values will be strings.
01361      * The virtual table v can be printed with its output operator
01362      * operator<< for ostreams.
01363      *
01364      * \attention Not atomic. Can yield stale data. 
01365      * Cannot be used in a multi-threaded-transaction context.
01366      */
01367     static rc_t            lock_collect(vtable_t&v, bool names_too=true);
01368 
01369     /**\brief Collect thread information in a virtual table.
01370      * \ingroup SSMVTABLE
01371      * \details
01372      * @param[out] v  The virtual table to populate.
01373      * @param[in] names_too  If true, make the 
01374      *            first row of the table a list of the attribute names.
01375      *
01376      * All attribute values will be strings.
01377      * The virtual table v can be printed with its output operator
01378      * operator<< for ostreams.
01379      *
01380      * \attention Not thread-safe. Can yield stale data. 
01381      */
01382     static rc_t            thread_collect(vtable_t&v, bool names_too=true);
01383 
01384     /**\brief Take a checkpoint.
01385      * \ingroup SSMAPIDEBUG
01386      * \note For debugging only!
01387      *
01388      * Force the storage manager to take a checkpoint.
01389      * Checkpoints are fuzzy : they can be taken while most other
01390      * storage manager activity is happening, even though they have
01391      * to be serialized with respect to each other, and with respect to
01392      * a few other activities.
01393      *
01394      * This is thread-safe.
01395      */
01396     static rc_t            checkpoint();
01397 
01398     /**\brief Force the buffer pool to flush its pages to disk.
01399      * \ingroup SSMAPIDEBUG
01400      * @param[in] invalidate   True means discard pages after flush.
01401      * \note For debugging only!
01402      * \attention Do not call force_buffers with anything pinned.
01403      * You may cause latch-latch deadlocks, as this method has
01404      * to scan the entire buffer pool and possibly EX-latch pages to prevent
01405      * others from updating while it forces to disk.
01406      * Since the page-order is essentially random, we cannot
01407      * preclude latch-latch deadlocks with other threads.
01408      */
01409     static rc_t            force_buffers(bool invalidate = false);
01410 
01411     /**\brief Force the buffer pool to flush the volume header page(s)
01412      * to disk.
01413      * \ingroup SSMAPIDEBUG
01414      * @param[in] vid   ID of the volume of interest
01415      * \note For debugging only!
01416      * \attention Do not call force_vol_hdr_buffers with anything pinned.
01417      * You could cause latch-latch deadlocks, as this method has
01418      * to scan the entire buffer pool and possibly EX-latch some pages.
01419      * Since the page-order is essentially random, we cannot
01420      * preclude latch-latch deadlocks with other threads.
01421      */
01422     static rc_t            force_vol_hdr_buffers( const vid_t&   vid);
01423 
01424     /**\brief Force the buffer pool to flush to disk all pages
01425      * for the given store.
01426      * \ingroup SSMAPIDEBUG
01427      * @param[in] stid   Store whose pages are to be flushed.
01428      * @param[in] invalidate   True means discard the pages after flushing.
01429      * \note For debugging only!
01430      * \attention Do not call force_store_buffers with anything pinned.
01431      * You may cause latch-latch deadlocks, as this method has
01432      * to scan the entire buffer pool and, if invalidate==true,
01433      * EX-latch pages to prevent others from updating 
01434      * while it forces to disk.
01435      * Since the page-order is essentially random, we cannot
01436      * preclude latch-latch deadlocks with other threads.
01437      */
01438     static rc_t            force_store_buffers(const stid_t & stid,
01439                                                bool invalidate);
01440 
01441     /**\cond skip 
01442      * Do not document. Very un-thread-safe.
01443      */
01444     static rc_t            dump_buffers(ostream &o);
01445     static rc_t            dump_locks(ostream &o);
01446     static rc_t            dump_locks(); // defaults to std::cout
01447     static rc_t            dump_exts(ostream &o, 
01448         vid_t                    v, 
01449         extnum_t                 start, 
01450         extnum_t                 end);
01451 
01452     static rc_t            dump_stores(ostream &o, 
01453         vid_t                    v, 
01454         int                      start, 
01455         int                      end);
01456 
01457     static rc_t            dump_histo(ostream &o, bool locked);
01458 
01459     static rc_t            snapshot_buffers(
01460         u_int&                 ndirty, 
01461         u_int&                 nclean, 
01462         u_int&                 nfree,
01463         u_int&                 nfixed);
01464     /**\endcond skip */
01465 
01466     /**\brief Get a copy of the statistics from an attached instrumented transaction.
01467      * \ingroup SSMXCT
01468      * \details
01469      * @param[out] stats Returns a copy of the statistics for this transaction.
01470      * @param[in] reset  If true, the statistics for this transaction will be zeroed.
01471      */
01472     static rc_t            gather_xct_stats(
01473         sm_stats_info_t&       stats, 
01474         bool                   reset = false);
01475 
01476     /**\brief Get a copy of the global statistics.
01477      * \ingroup SSMSTATS
01478      * \details
01479      * @param[out] stats A pre-allocated structure.
01480      */
01481     static rc_t            gather_stats(
01482         sm_stats_info_t&       stats
01483         );
01484 
01485     /**\brief Get a copy of configuration-dependent information.
01486      * \ingroup OPT
01487      * \details
01488      * @param[out] info A pre-allocated structure.
01489      */
01490     static rc_t            config_info(sm_config_info_t& info);
01491 
01492     /**\brief Set sleep time before I/O operations.
01493      * \ingroup SSMVOL
01494      * \details
01495      * This method sets a milli_sec delay to occur before 
01496      * each disk read/write operation.  This is for debugging.
01497      * It is useful in discovering thread sync bugs.
01498      * This delay applies to all threads.
01499     */
01500     static rc_t            set_disk_delay(u_int milli_sec);
01501 
01502     /**\cond skip */
01503     // TODO : document crash testing facilities
01504     /**\brief Simulate a crash
01505      * \details
01506      * This method tells the log manager to start generating corrupted
01507      * log records.  This will make it appear that a crash occurred
01508      * at that point in the log.  A call to this method should be
01509      * followed immediately by a dirty shutdown of the ssm.
01510      */
01511     static rc_t            start_log_corruption();
01512 
01513     /* for smsh/debugging:   
01514      * log an arbitrary message */
01515     static rc_t            log_message(const char * const msg);
01516     /**\endcond skip */
01517 
01518     // Forces a log flush
01519     static rc_t            sync_log(bool block=true);
01520     static rc_t            flush_until(lsn_t& anlsn, bool block=true);
01521 
01522     // Allowing to access info about the important lsns (curr and durable)
01523     static rc_t            get_curr_lsn(lsn_t& anlsn);
01524     static rc_t            get_durable_lsn(lsn_t& anlsn);
01525 
01526 
01527     /*
01528        Device and Volume Management
01529        ----------------------------
01530        A device is either an operating system file or operating system
01531        device and is identified by a path name (absolute or relative).
01532        A device has a quota.  In theory, a device may have 
01533        multiple volumes on it but
01534        in the current implementation the maximum number of volumes
01535        is 1.
01536 
01537        A volume is where data is stored.  A volume is identified
01538        uniquely and persistently by a long volume ID (lvid_t).
01539        Volumes can be used whenever the device they are located
01540        on is mounted by the SM.  Volumes have a quota.  The
01541        sum of the quotas of all the volumes on a device cannot
01542        exceed the device quota.
01543 
01544        The basic steps to begin using a new device/volume are:
01545         format_dev: initialize the device
01546         mount_dev: allow use of the device and all its volumes
01547         generate_new_lvid: generate a unique ID for the volume
01548         create_vol: create a volume on the device
01549      */
01550 
01551     /*
01552      * Device management functions
01553      */
01554      /**\addtogroup SSMVOL 
01555       * The storage manager was designed to permit multiple \e volumes
01556       * on a \e device, with \e volume analogous to a Unix \e partition and
01557       * a \e device analogous to a disk, and the original SHORE contained
01558       * symmetric peer servers.  
01559       * However good that intention, multiple volumes on a device were never
01560       * implemented, and times have changed, and the storage manager no
01561       * longer has any notion of remote and local volumes.
01562       * The notion of a volume, separate from a device, remains, but may
01563       * some day disappear.
01564       *
01565       * For the time being, a device contains at most one volume. 
01566       *
01567      * A device is either an operating system file or 
01568      * an operating system device (e.g., raw disk partition) and  
01569      * is identified by a path name (absolute or relative).
01570      *
01571      * A device has a quota.  
01572      * A device is intended to have multiple volumes on it, but
01573      * in the current implementation the maximum number of volumes
01574      * is exactly 1.
01575      *
01576      * A volume is where data are stored.  
01577      * Each volume is a header and a set of pages. All pages are
01578      * the same size (this is a compile-time constant, the default being
01579      * 8K and sizes up to 64K permissible).
01580      *
01581      * A volume is identified uniquely and persistently by a 
01582      * long volume ID (lvid_t), which is stored in its header.
01583      * Volumes can be used whenever the device they are located
01584      * on is mounted by the SM.  
01585      * Volumes have a quota.  The
01586      * sum of the quotas of all the volumes on a device cannot
01587      * exceed the device quota.
01588      *
01589      * A volume contains a variety of data structures. All user
01590      * data reside in \e stores.  A store is a collection of the
01591      * pages on the volume, allocated in \e extents of a size that
01592      * is a compile-time constant. (The storage manager has only
01593      * been tested with an extent-size of 8 pages. The compile-time constant
01594      * can be changed, but it also requires changes elsewhere in the code
01595      * to maintain alignment of persistent structures.
01596      * See the comments in config/shore.def.) Thus, the minimum size
01597      * of a store is one extent's worth of pages.
01598      * Larger extents provide better clustering, but more wasted space if
01599      * small files and small indexes will be common.
01600      *
01601      * Stores are identified by a store number (snum_t).
01602      *
01603      * Each volume contains a few stores that are "overhead":
01604      * 0 -- is reserved for an extent map and a store map
01605      * 1 -- directory (dir_m)
01606      * 2 -- root index 
01607      *
01608      * Beyond that, for each (user) file created, 2 stores are used, one for
01609      * small objects, one for large objects, and for each index (btree, rtree) 
01610      * created 1 store is used.
01611      *
01612      * Each volume is laid out thus:
01613      * - volume header, which identifies the number of extents on
01614      *   the volume, determined when the volume is formatted.
01615      *   This is always in page 1 of the volume.
01616      * - store map: some number of pages describing the stores on the volume,
01617      *   namely, being the heads of linked-lists of extents that make up
01618      *   the stores. The number of such pages is determined when the
01619      *   volume is formatted.  The worst case is assumed, which is that one
01620      *   might fill the volume with one-extent stores.
01621      * - extent map: some number of pages of bitmaps, one bitmap for each 
01622      *   extent,  describe which pages in the extents are allocated or free.
01623      * - data pages: the rest of the volume.
01624      *
01625      */
01626 
01627     /**\brief Format a device.
01628      * \ingroup SSMVOL
01629      * \details
01630      * @param[in] device   Operating-system file name of the "device".
01631      * @param[in] quota_in_KB  Quota in kilobytes.
01632      * @param[in] force If true, format the device even if it already exists.
01633      *
01634      * Since raw devices always "exist", \a force should be given as true 
01635      * for raw devices.
01636      *
01637      * A device may not be formatted if it is already mounted.
01638      *
01639      * \note This method should \b not 
01640      * be called in the context of a transaction.
01641      */
01642     static rc_t            format_dev(
01643         const char*            device,
01644         smksize_t              quota_in_KB,
01645         bool                   force);
01646     
01647     /**\brief Mount a device.
01648      * \ingroup SSMVOL
01649      * \details
01650      * @param[in] device   Operating-system file name of the "device".
01651      * @param[out] vol_cnt Number of volumes on the device.
01652      * @param[out] devid  A local device id assigned by the storage manager.
01653      * @param[in] local_vid A local handle to the (only) volume on the device,
01654      * to be used when a volume is mounted.  The default, vid_t::null, 
01655      * indicates that the storage manager can choose a value for this. 
01656      *
01657      * \note It is fine to mount a device more than once, as long as device
01658      * is always the same (you cannot specify a hard link or soft link to
01659      * an entity mounted under a different path). 
01660      * Device mounts are \b not reference-counted, so a single dismount_dev
01661      * renders the volumes on the device unusable.
01662      *
01663      * \note This method should \b not 
01664      * be called in the context of a transaction.
01665      */
01666     static rc_t            mount_dev(
01667         const char*            device,
01668         u_int&                 vol_cnt,
01669         devid_t&               devid,
01670         vid_t                  local_vid = vid_t::null);
01671 
01672     /**\brief Dismount a device.
01673      * \ingroup SSMVOL
01674      * \details
01675      * @param[in] device   Operating-system file name of the "device".
01676      *
01677      * \note It is fine to mount a device more than once, as long as device
01678      * is always the same (you cannot specify a hard link or soft link to
01679      * an entity mounted under a different path). 
01680      * Device mounts are \b not reference-counted, so a single dismount_dev
01681      * renders the volumes on the device unusable.
01682      *
01683      * \note This method should \b not 
01684      * be called in the context of a transaction.
01685      */
01686 
01687     static rc_t            dismount_dev(const char* device);
01688 
01689     /**\brief Dismount all mounted devices.
01690      * \ingroup SSMVOL
01691      *
01692      * \note This method should \b not 
01693      * be called in the context of a transaction.
01694      */
01695     static rc_t            dismount_all();
01696 
01697     // list_devices returns an array of char* pointers to the names of
01698     // all mounted devices.  Note that the use of a char*'s is 
01699     // a temporary hack until a standard string class is available.
01700     // the char* pointers are pointing directly into the device
01701     // mount table.
01702     // dev_cnt is the length of the list returned.
01703     // dev_list and devid_list must be deleted with delete [] by the
01704     // caller if they are not null (0).  They should be null
01705     // if an error is returned or if there are no devices.
01706     /**\brief Return a list of all mounted devices.
01707      * \ingroup SSMVOL
01708      * \details
01709      * @param[out] dev_list   Returned list of pointers directly into the mount table.
01710      * @param[out] devid_list   Returned list of associated device ids.
01711      * @param[out] dev_cnt   Returned number of entries in the two above lists.
01712      *
01713      * The storage manager allocates the arrays returned with new[], and the
01714      * caller must return these to the heap with delete[] if they are not null.
01715      * They will be null if an error is returned or if no devices are mounted.
01716      *
01717      * The strings to which dev_list[*] point are \b not to be deleted by
01718      * the caller.
01719      */
01720     static rc_t            list_devices(
01721         const char**&            dev_list, 
01722         devid_t*&                devid_list, 
01723         u_int&                   dev_cnt);
01724 
01725     /**\brief Return a list of all volumes on a device.
01726      * \ingroup SSMVOL
01727      * \details
01728      * @param[in] device   Operating-system file name of the "device".
01729      * @param[out] lvid_list   Returned list of long volume ids (lvid_t) of the volumes on the device.
01730      * @param[out] lvid_cnt   Returned length of list lvid_list.
01731      *
01732      * The storage manager allocates the array lvid_list 
01733      * with new[], and the
01734      * caller must return it to the heap with delete[] if it is not null.
01735      * It will be null if an error is returned. 
01736      *
01737      * \note This method should \b not 
01738      * be called in the context of a transaction.
01739      */
01740     static rc_t            list_volumes(
01741         const char*            device,
01742         lvid_t*&               lvid_list,
01743         u_int&                 lvid_cnt
01744     );
01745 
01746     // get_device_quota returns the "quota" (in KB) of the device
01747     // and the amount of the quota allocated to volumes on the device.
01748     /**\brief Get the device quota.
01749      * \ingroup SSMVOL
01750      * \details
01751      * @param[in] device   Operating-system file name of the "device".
01752      * @param[out] quota_KB   Returned quota in kilobytes
01753      * @param[out] quota_used_KB   Returned portion of quota allocated to volumes
01754      *
01755      * The quota_used_KB is the portion of the quota allocated to volumes on the device.
01756      *
01757      * \note This method \b may 
01758      * be called in the context of a transaction.
01759      *
01760      * \note This method \b may 
01761      * be called in the context of a transaction.
01762      */
01763     static rc_t            get_device_quota(
01764         const char*             device, 
01765         smksize_t&              quota_KB, 
01766         smksize_t&              quota_used_KB);
01767 
01768 
01769     /*
01770      * Volume management functions
01771      */
01772 
01773     /**\brief Change the fake disk latency before I/Os on this volume, 
01774      * for debugging purposes
01775      * \ingroup SSMVOL
01776      * \details
01777      * @param[in] vid  The ID of the volume of interest.
01778      * @param[in] adelay  Nanoseconds to sleep with ::nanosleep()
01779      *
01780      * This is for debugging only.
01781      * Changing the value of the latency for a volume does not enable the
01782      * delay.
01783      */
01784     static rc_t set_fake_disk_latency(vid_t vid, const int adelay);
01785 
01786     /**\brief Enable the fake disk latency before I/Os on this volume, for debugging purposes
01787      * \ingroup SSMVOL
01788      * \details
01789      * @param[in] vid  The ID of the volume of interest.
01790      *
01791      * This is for debugging only.
01792      * When this is enabled, it uses whatever disk latency was set with
01793      * ss_m::create_vol() or the last applied ss_m::set_fake_disk_latency().
01794      */
01795     static rc_t enable_fake_disk_latency(vid_t vid);
01796     /**\brief Disable the fake disk latency before I/Os on this volume, for debugging purposes
01797      * \ingroup SSMVOL
01798      * \details
01799      * @param[in] vid  The ID of the volume of interest.
01800      *
01801      * This is for debugging only.
01802      */
01803     static rc_t disable_fake_disk_latency(vid_t vid);
01804 
01805 
01806     /**\brief Generate a new, unique long volume id.
01807      * \ingroup SSMVOL
01808      * \details
01809      * @param[out] lvid  Generated long volume id, to be used on ss_m::create_vol().
01810      * @param[in] hostname  Name to use for local host. Default is one of:
01811      * host name derived from gethostbyname/uname or "localhost.localdomain", 
01812      * depending on whether the requisite functions exist on the 
01813      * local machine.  Using a non-null
01814      * argument obviates the use of any other mechanism to derive a 
01815      * host name.  The host name is used to get a host address, 
01816      * which is part of the unique identifier.
01817      *
01818      * This generates a unique volume identifier to be written persistently
01819      * on the volume when it is formatted.
01820      * This enables us to avoid the mistake of doubly-mounting a volume.
01821      * The identifier is constructed from the machine network address and the
01822      * time of day.
01823      */
01824     static rc_t generate_new_lvid(lvid_t& lvid, const char *hostname=NULL);
01825      
01826     /**\brief Add a volume to a device.
01827      * \ingroup SSMVOL
01828      * \details
01829      * @param[in] device_name   Operating-system file name of the "device".
01830      * @param[in] lvid  Long volume id to use when formatting the new volume.
01831      * @param[in] quota_KB  Quota in kilobytes.
01832      * @param[in] skip_raw_init  Do not initialize the volume if on a raw device.
01833      * @param[in] local_vid Short volume id by which to refer to this volume.
01834      *            If null, the storage manager will assign one.
01835      * @param[in] apply_fake_io_latency See ss_m::enable_fake_disk_latency()
01836      * @param[in] fake_disk_latency See ss_m::set_fake_disk_latency()
01837      *
01838      * \note This method should \b not 
01839      * be called in the context of a transaction.
01840      *
01841      * The pages on the volume \b must be zeroed; you can only use
01842      * \a skip_raw_init = true if you have by some other means
01843      * already initialized the volume.
01844      */
01845     static rc_t            create_vol(
01846         const char*             device_name,
01847         const lvid_t&           lvid,
01848         smksize_t               quota_KB,
01849         bool                    skip_raw_init = false,
01850         vid_t                   local_vid = vid_t::null,
01851         const bool              apply_fake_io_latency = false,
01852         const int               fake_disk_latency = 0);
01853 
01854     /**\brief Destroy a volume.
01855      * \ingroup SSMVOL
01856      * \details
01857      * @param[in] lvid  Long volume id by which the volume is known.
01858      *
01859      * \note This method should \b not 
01860      * be called in the context of a transaction.
01861      */
01862     static rc_t            destroy_vol(const lvid_t& lvid);
01863 
01864     /**\brief Gets the quotas associated with the volume.
01865      * \ingroup SSMVOL
01866      * @param[in] lvid  Long volume id by which the volume is known.
01867      * @param[out] quota_KB  Quota given when the volume was created.
01868      * @param[out] quota_used_KB  Portion of the quota that has been used by
01869      * allocated extents.
01870      */
01871     static rc_t            get_volume_quota(
01872         const lvid_t&             lvid, 
01873         smksize_t&                quota_KB, 
01874         smksize_t&                quota_used_KB);
01875 
01876     /**\cond skip */
01877     // check_volume_page_types: strictly for debugging/testing
01878     static rc_t             check_volume_page_types(vid_t vid);
01879     /**\endcond skip */
01880 
01881 
01882     /**\brief Analyze a volume and report statistics regarding disk usage.
01883      * \ingroup SSMVOL
01884      * @param[in] vid The volume of interest.
01885      * @param[out] du The structure that will hold the collected statistics.
01886      * @param[in] audit If "true", the method acquires a share lock on the
01887      * volume and then will check assertions about the
01888      * correctness of the data structures on the volume. 
01889      * If the audit fails an internal fatal error is generated 
01890      * to facilitate debugging. (It will generate a core file if your
01891      * shell permits such.)
01892      * If "false" an IS lock is acquired, which means that the
01893      * statistics will be fuzzy.
01894      *
01895      * Using the audit feature is useful for debugging.
01896      * It is the only safe way to use this method.
01897      * \note The statistics are added to the sm_du_stats_t structure passed in.
01898      * This structure is not cleared by the storage manager.
01899      */
01900     static rc_t            get_du_statistics(
01901         vid_t                 vid,
01902         sm_du_stats_t&        du,
01903         bool                  audit = true); 
01904 
01905     /**\brief Analyze a store and report statistics regarding disk usage.
01906      * \ingroup SSMVOL
01907      * @param[in] stid The store of interest.
01908      * @param[out] du The structure that will hold the collected statistics.
01909      * @param[in] audit If "true", the method acquires a share lock on the
01910      * store and then will check assertions about the
01911      * correctness of the data structures on the store. 
01912      *
01913      * Using the audit feature is useful for debugging.
01914      * It is the only safe way to use this method.
01915      *
01916      */
01917     static rc_t            get_du_statistics(
01918         const stid_t&        stid, 
01919         sm_du_stats_t&       du,
01920         bool                 audit = true);
01921     
01922     /**\brief Dump disk information about the indicated volume.
01923      * \ingroup SSMVOL
01924      * @param[in] vid The volume of interest.
01925      *
01926      * This function is for debugging.
01927      * It dumps, to the error log, at info_prio priority,
01928      * metadata about the given volume, including the number of extents
01929      * on the volume, the extent size, and the number of pages dedicated
01930      * to store maps and extent maps. Then, for each store on the volume,
01931      * it dumps the status of the store and the extents allocated to 
01932      * that store.
01933      *
01934      * This function must be run in a transaction, though the function
01935      * is read-only.
01936      */
01937     static rc_t            dump_vol_store_info(const vid_t &vid);
01938 
01939     /**\brief Analyze  a volume and collect brief statistics about its usage.
01940      * \ingroup SSMVOL
01941      * @param[in] vid The volume of interest.
01942      * @param[out] volume_stats The statistics are written here.
01943      * @param[in] cc Indicates whether the volume is to be locked 
01944      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01945      *
01946      * If no lock is acquired, the method can fail with eRETRY.
01947      *
01948      */
01949     static rc_t            get_volume_meta_stats(
01950         vid_t                vid,
01951         SmVolumeMetaStats&   volume_stats,
01952         concurrency_t        cc = t_cc_none
01953     );
01954 
01955     /**\brief Analyze a volume and collect brief statistics about the usage of individual files on it.
01956      * \ingroup SSMVOL
01957      * @param[in] vid The volume of interest.
01958      * @param[in] num_files The size of the array file_stats.
01959      * @param[out] file_stats Preallocated array of structs into which to
01960      * write the statistics for the individual files inspected.
01961      * @param[in] batch_calculate  True means make one pass over the volume.
01962      * @param[in] cc Indicates whether the volume is to be locked 
01963      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01964      *
01965      * If no lock is acquired and batch_calculate is not set, 
01966      * the method can fail with eRETRY.
01967      *
01968      *
01969      * If batch_calculate is true then this works by making one pass
01970      * over the meta data, but it looks at all the meta data.  This
01971      * should be the faster way to do the analysis when there are 
01972      * many files, and when files use a large portion of the volume.
01973      *
01974      * If batch_calculate is false then each file is updated
01975      * individually, only looking at the extent information for that
01976      * particular file. This requires a pass over the volume for each
01977      * file. (Seek-wise it is less efficient).
01978      *
01979      */
01980     static rc_t            get_file_meta_stats(
01981         vid_t                vid,
01982         w_base_t::uint4_t    num_files,
01983         SmFileMetaStats*     file_stats,
01984         bool                 batch_calculate = false,
01985         concurrency_t        cc = t_cc_none
01986     );
01987    
01988     /**\brief Get the index ID of the root index of the volume.
01989      * \ingroup SSMVOL
01990      *
01991      * @param[in] v Volume of interest.
01992      * @param[out] iid Store ID of the root index.
01993      * \details
01994      *
01995      * Each volume has a root index, which is a well-known
01996      * index available to the server for bootstrapping a database.
01997      *
01998      */
01999     static rc_t            vol_root_index(
02000         const vid_t&        v,   // volume whose root index is requested
02001         stid_t&             iid  // output: set to the pair (v, store_id_root_index)
02002     )    { iid.vol = v; iid.store = store_id_root_index; return RCOK; } // only fills in iid from v and the constant store_id_root_index; always returns RCOK
02003 
02004     /*****************************************************************
02005      * storage operations: smfile.cpp
02006      *****************************************************************/
02007     /**\addtogroup SSMSTORE 
02008      * Indexes and files are special cases of "stores".
02009      * A store is a linked list of extents, and an extent is a
02010      * contiguous group of pages.  So the store is the structure
02011      * that holds together an ordered set of pages that can be
02012      * used by a server and have an identifier (a store ID or stid_t).
02013      *
02014      * Indexes and files of records are built on stores.
02015      *
02016      * Stores have logging properties and 
02017      * other metadata associated with them.
02018      * 
02019      * The property that determines the logging level of the store is
02020      * \ref sm_store_property_t.
02021      *
02022      * Methods that let you get and change the metadata are:
02023      * - ss_m::get_store_property
02024      * - ss_m::set_store_property
02025      * - ss_m::get_store_info
02026      * - \ref snum_t
02027      *
02028      * When a transaction deletes a file or index, the deletion of the
02029      * underlying stores is delayed until the transaction commits so that
02030      * the pages allocated to the stores remain reserved (in case the
02031      * transaction aborts). The deleting transaction could, in theory,
02032      * reuse the pages for another store, but in practice that is not done.
02033      * Instead, when a store is deleted, the store is marked
02034      * for deletion and put in a list for the transaction to delete upon
02035      * commit.   At commit time, stores that have property t_load_file
02036      * or t_insert_file are converted to t_regular.
02037      */
02038 
02039     /**\brief Change the store property of a file or index.
02040      * \ingroup SSMSTORE
02041      * @param[in] stid   File ID or index ID of the store to change.
02042      * @param[in] property   Enumeration store_property_t (alias for
02043      *                   smlevel_3::sm_store_property_t, q.v.)
02044      *
02045      * \details
02046      * The possible uses of store properties are described with 
02047      * smlevel_3::sm_store_property_t.
02048      */
02049     static rc_t            set_store_property(
02050         stid_t                stid,
02051         store_property_t      property
02052         );
02053 
02054     /**\brief Get the store property of a file or index.
02055      * \ingroup SSMSTORE
02056      * @param[in] stid   File ID or index ID of the store of interest.
02057      * @param[out] property   Reference to enumeration store_property_t 
02058      *                  (alias for smlevel_3::sm_store_property_t, q.v.)
02059      *
02060      * \details
02061      * The possible uses of store properties are described with 
02062      * smlevel_3::sm_store_property_t.
02063      */
02064     static rc_t            get_store_property(
02065         stid_t                stid,
02066         store_property_t&     property);
02067 
02068     /**\brief Get various store information of a file or index.
02069      * \ingroup SSMSTORE
02070      * @param[in] stid   File ID or index ID of the store of interest.
02071      * @param[out] info  Reference to sm_store_info_t into which to
02072      * write the results.
02073      *
02074      * \details
02075      * Get internally stored information about a store.
02076      */
02077     static rc_t            get_store_info( 
02078         const stid_t&         stid, 
02079         sm_store_info_t&      info);
02080 
02081     //
02082     // Functions for B+tree Indexes
02083     //
02084     /**\addtogroup SSMBTREE 
02085      * The storage manager supports B+-Tree indexes, which provide associative access 
02086      * to data by associating keys with values in 1:1 or many:1 relationships.
02087      * Keys may be composed of any of the basic C-language types (integer,
02088      * unsigned, floating-point of several sizes) or
02089      * variable-length character strings (wide characters are \b not supported).
02090      *
02091      * The number of key-value pairs that an index can hold is limited by the
02092      * space available on the volume containing the index.
02093      * \anchor max_entry_size 
02094      * The combined sizes of the key and value must
02095      * be less than or equal to \ref max_entry_size, which is
02096      * a function of the page size, and is 
02097      * such that two entries of this size fit on a page along with all
02098      * the page and entry metadata.  See sm_config_info_t and ss_m::config_info.
02099      *
02100      * The minimum size of a B-Tree index is 8 pages (1 extent).
02101      *
02102      * A variety of locking protocols is supported:
02103      * - none : acquire no locks on the {key,value} pairs in the index,
02104      *   although an intention lock might be acquired on the index.
02105      * - kvl : key-value locking. See \ref MOH1.  The key or
02106      *   key-value pair is hashed into a 4-byte value and used with the
02107      *   given store id to make a lock id.
02108      * - im : index-management locking. See \ref MOH1.  
02109      *   The "value" portion of
02110      *   the key-value lock is taken to be a record id, which is used 
02111      *   for the lock id.
02112      * - modified kvl : an ad-hoc protocol used by the Paradise project. See \ref MODKVL "the scan_index_i constructor". As with index-management locking, 
02113      *   the "value" portion of
02114      *   the key-value lock is taken to be a record id, which is used 
02115      *   for the lock id.
02116      * - file : full-index locking.
02117      *
02118      * \section key_description Key Types
02119      * A B+-Tree index key has a type determined when the index is created.
02120      * All keys are stored in lexicographic format based on an interpretation of
02121      * the key determined by the key description given when the index is
02122      * created.
02123      * Lookups on the B+-Tree then involve a single byte-by-byte
02124      * comparison of two byte-strings, each composed of its concatenated
02125      * sub-keys.
02126      *
02127      * The key description is a null-terminated string as follows:
02128      \verbatim
02129      <key_description>    ::=  <fixed_len_part>*  <variable_len_part>  |
02130                                <fixed_len_part>+ 
02131      <fixed_len_part>     ::=  <type> <len> 
02132      <variable_len_part>  ::=  <type> '*' <len>
02133      <type>               ::=  'i' | 'u' | 'f' | 'b' | 'I' | 'U' | 'F' | 'B'
02134      <len>                ::=   [1-9][0-9]*
02135      \endverbatim
02136      * Thus, a key may have any number of fixed-length parts followed by at
02137      * most one variable-length part.
02138      *
02139      * The fixed-length parts (if present) consist of a type and a length.
02140      *
02141      * The variable-length part (if present) consists of a type and a length
02142      * separated by an asterisk, which is what distinguishes a variable-length
02143      * from a fixed-length part.
02144      *
02145      * Types and permissible lengths are:
02146      * - integer (1,2,4,8)
02147      * - unsigned (1,2,4,8)
02148      * - floating (4,8)
02149      * - uninterpreted byte (any length greater than zero)
02150      *
02151      * A capital letter indicates that the key part may be compressed. Only prefix
02152      * compression is implemented, so it makes sense to compress if the
02153      * first part of the key is compressible.
02154      *
02155      * Examples:
02156      * - "B40u4u2u2" : 40-byte character string followed by a 4-byte integer,
02157      *                 a 2-byte integer and a 2-byte integer, such as one might
02158      *                 use for name.year.mo.day.  The character string is
02159      *                 prefix-compressed.
02160      * - "f8"        : an 8-byte floating-point number (double)
02161      * - "I8B*1000"  : An 8-byte integer followed by an uninterpreted string
02162      *                 of up to 1000 bytes, all prefix-compressed.
02163      *
02164      * \note Wide characters are not supported.
02165      *
02166      * This key descriptor is stored in the sm_store_info_t, which is
02167      * stored on the volume and is available with the method ss_m::get_store_info.
02168      * Keys are stored in \ref LEXICOFORMAT "lexicographic format". The
02169      * storage manager knows how to convert all the key types listed above.
02170      * When duplicates are permitted, the index assumes that the elements
02171      * are in lexicographic order when searching for a <key,element> pair.
02172      *
02173      * \section XXXX1 Bulk Loading 
02174      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02175      */
02176 
02177 
02178     /**\brief Create a B+-Tree index.
02179      * \ingroup SSMBTREE
02180      * @param[in] vid   Volume on which to create the index.
02181      * @param[in] ntype   Type of index. Legitimate values are: 
02182      *  - t_btree : B+-Tree with duplicate keys allowed
02183      *  - t_uni_btree : B+-Tree without duplicate keys 
02184      * @param[in] property Logging level of store. Legitimate values are:
02185      *  - t_regular
02186      *  - t_load_file
02187      *  - t_insert_file
02188      *  See sm_store_property_t for details.
02189      * @param[in] key_desc Description of key type.
02190      *  See \ref key_description for details.
02191      * @param[in] cc The locking protocol to use with this index. See
02192      * smlevel_0::concurrency_t and \ref SSMBTREE.
02193      * @param[out] stid New store ID will be returned here.
02194      */
02195     static rc_t            create_index(
02196                 vid_t                 vid, 
02197                 ndx_t                 ntype, 
02198                 store_property_t      property,
02199                 const char*           key_desc,
02200                 concurrency_t         cc, 
02201                 stid_t&               stid
02202     );
02203 
02204     /**\brief Create a B+-Tree or R*-Tree index.
02205      * \ingroup SSMBTREE
02206      *\attention For backward compatibility. Will be deprecated later.
02207      */
02208     static rc_t            create_index(
02209                 vid_t                 vid, 
02210                 ndx_t                 ntype, 
02211                 store_property_t      property,
02212                 const char*           key_desc,
02213                 stid_t&               stid
02214     );
02215 
02216     /**\brief Destroy a B+-Tree index.
02217      * \ingroup SSMBTREE
02218      *
02219      * @param[in] iid  ID of the index to be destroyed.
02220      */
02221     static rc_t            destroy_index(const stid_t& iid); 
02222 
02223     /**\brief Bulk-load a B+-Tree index from multiple data sources.
02224      * \ingroup SSMBULKLD
02225      *
02226      * @param[in] stid  ID of the index to be loaded.
02227      * @param[in] nsrcs  Number of files used for data sources.
02228      * @param[in] source  Array of IDs of files used for data sources.
02229      * @param[out] stats  Statistics concerning the load activity will be
02230      *                     written here.
02231      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02232      * duplicates by value.
02233      * @param[in] lexify_keys  If "true" the keys are assumed not to
02234      * be in 
02235      * lexicographic format, and the bulk-load will reformat the key before
02236      * storing it in the index,
02237      * otherwise they are assumed already to be in lexicographic format.
02238      *
02239      * \anchor LEXICOFORMAT 
02240      * \b Lexicographic \b format
02241      * is the translation of numbers 
02242      * (int, float, double, unsigned, etc) into byte strings
02243      * such that a lexicographic comparison of the byte strings
02244      * yields the same result as the numeric comparison of the
02245      * original data.
02246      *
02247      * \note The data must already have been sorted by 
02248      * key in lexicographic format, but the keys themselves don't have
02249      * to be in lexicographic format; if the keys are not already in
02250      * lexicographic format, the \a lexify_keys must be given the value "true".
02251      *
02252      * In the case of duplicate keys, the bulk-load will handle the
02253      * sorting of the elements if \a sort_duplicates is "true"; this
02254      * sort will be done by a lexicographic comparison of the 
02255      * byte strings that compose the elements.
02256      */
02257     static rc_t            bulkld_index(
02258         const stid_t&             stid, 
02259         int                       nsrcs,
02260         const stid_t*             source,
02261         sm_du_stats_t&            stats,
02262         bool                      sort_duplicates = true,
02263         bool                      lexify_keys = true
02264     );
02265     /**\brief Bulk-load a B+-Tree index from a single data source.
02266      * \ingroup SSMBULKLD
02267      *
02268      * @param[in] stid  ID of the index to be loaded.
02269      * @param[in] source  ID of the file used as the data source.
02270      * @param[out] stats  Statistics concerning the load activity will be
02271      *                     written here.
02272      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02273      * duplicates by value.
02274      * @param[in] lexify_keys  If "true" the keys are assumed not to
02275      * be in 
02276      * lexicographic format, and the bulk-load will reformat the key before
02277      * storing it in the index;
02278      * otherwise they are assumed already to be in lexicographic format.
02279      */
02280     static rc_t            bulkld_index(
02281         const stid_t&             stid, 
02282         const stid_t&             source,
02283         sm_du_stats_t&            stats,
02284         bool                      sort_duplicates = true,
02285         bool                      lexify_keys = true
02286     );
02287     /**\brief Bulk-load a B+-Tree index from a single data stream.
02288      * \ingroup SSMBULKLD
02289      *
02290      * @param[in] stid  ID of the index to be loaded.
02291      * @param[in] sorted_stream  Iterator that serves as the data source.
02292      * @param[out] stats  Statistics concerning the load activity will be
02293      *                     written here.
02294      *
02295      * See sort_stream_i.
02296      */
02297     static rc_t            bulkld_index(
02298         const stid_t&             stid, 
02299         sort_stream_i&            sorted_stream,
02300         sm_du_stats_t&            stats);
02301 
02302     /**\cond skip */
02303     static rc_t            print_index(stid_t stid);
02304     /**\endcond skip */
02305 
02306     /**\brief Create an entry in a B+-Tree index.
02307      * \ingroup SSMBTREE
02308      *
02309      * @param[in] stid  ID of the index. 
02310      * @param[in] key  Key for the association to be created.
02311      * @param[in] el  Element for the association to be created.
02312      *
02313      * The combined sizes of the key and element vectors must
02314      * be less than or equal to \ref max_entry_size.
02315      */
02316     static rc_t            create_assoc(
02317         stid_t                   stid, 
02318         const vec_t&             key, 
02319         const vec_t&             el
02320 #ifdef SM_DORA
02321         , const bool             bIgnoreLocks = false
02322 #endif
02323     );
02324     /**\brief Remove an entry from a B+-Tree index.
02325      * If your index is non-unique (i.e., it may contain
02326      * multiple entries per key), use destroy_all_assoc.
02327      *
02328      * \ingroup SSMBTREE
02329      *
02330      * @param[in] stid  ID of the index. 
02331      * @param[in] key   Key of the entry to be removed.
02332      * @param[in] el   Element (value) of the entry to be removed.
02333      */
02334     static rc_t            destroy_assoc(
02335         stid_t                   stid, 
02336         const vec_t&             key,
02337         const vec_t&             el
02338 #ifdef SM_DORA
02339         , const bool             bIgnoreLocks = false
02340 #endif
02341     );
02342     /**\brief Destroy all entries associated with a key in a B+-Tree index.
02343      * \ingroup SSMBTREE
02344      *
02345      * @param[in] stid  ID of the index. 
02346      * @param[in] key   Key of the entries to be removed.
02347      * @param[out] num_removed   The number of entries removed is returned here.
02348      */
02349     static rc_t            destroy_all_assoc(
02350         stid_t                  stid, 
02351         const vec_t&            key,
02352         int&                    num_removed
02353     );
02354     /**\brief Find an entry associated with a key in a B+-Tree index. 
02355      * \ingroup SSMBTREE
02356      *
02357      * @param[in] stid  ID of the index. 
02358      * @param[in] key   Key of the entry to look up.
02359      * @param[out] el   Element associated with the given key will be copied into this buffer.
02360      * @param[in,out] elen On input, length of the buffer into which the 
02361      *                  result will be written. If too small, eRECWONTFIT will
02362      *                  be returned.
02363      *                 On output, the length of the result.
02364      * @param[out] found   True if an entry is found.
02365      *
02366      * If the index is not unique (allows duplicates), the first
02367      * element found with the given key will be returned.
02368      *
02369      * To locate all entries associated with a non-unique key, you must
02370      * use scan_index_i, q.v.
02371      */
02372     static rc_t            find_assoc(
02373         stid_t                  stid, 
02374         const vec_t&            key, 
02375         void*                   el, 
02376         smsize_t&               elen, 
02377         bool&                   found
02378 #ifdef SM_DORA
02379         , const bool             bIgnoreLocks = false
02380 #endif
02381     );
02382 
02383     //
02384     // Functions for R*tree (multi-dimensional(MD), spatial) Indexes
02385     //
02386 
02387     /**\addtogroup SSMRTREE 
02388      *
02389      * An R-tree is a height-balanced structure designed for indexing
02390      * multi-dimensional spatial objects.  
02391      * It stores the minimal bounding box (with 2 or more dimensions) of 
02392      * a spatial object as the key in the leaf pages.
02393      * This implementation is a variant of an R-Tree called an R*-Tree, which
02394      * improves the search performance by using a heuristic for redistributing
02395      * entries and dynamically reorganizing the tree during insertion.
02396      *
02397      * An R*-Tree stores key,value pairs where the key is of type nbox_t
02398      * and the value is of type vec_t.
02399      *
02400      * The number of key-value pairs an index can hold is limited by the space
02401      * available on the volume containing the index.
02402      * The minimum size of an R*-tree index is 8 pages.
02403      *
02404      * 
02405      * \note This implementation 
02406      * uses coarse-grained (index-level) locking and 
02407      * supports only 2 dimensions and integer coordinates.
02408      * For information about R*-trees, see the \ref BKSS.
02409      *
02410      * Example:
02411      * \code
02412      scan_rt_i scan(idx, nbox_t::t_overlap, universe, true);
02413      bool      eof;
02414      nbox_t    k;
02415      char*     e;
02416      smsize_t  elen;
02417 
02418      int i = 0;
02419      for( ; !(rc = scan.next(k,e,elen,eof)).is_error() && !eof; i++)
02420          ;
02421      cout << "Rtree " << idx << " contains " << i << " entries." << endl;
02422      \endcode
02423      * 
02424      *
02425      * \section XXXX2 Bulk Loading 
02426      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02427      */
02428      /*\example rtree_example.cpp*/
02429 
02430 
02431     /**\brief Create an R*-Tree (multi-dimensional spatial) index.
02432      * The storage manager does not provide
02433      * complete support for non-unique multidimensional indexes.
02434      * While you may insert multiple (distinct) entries for the same key in 
02435      * a multi-dimensional index, you will not be able to use them; only
02436      * the first can be retrieved.  
02437      * \ingroup SSMRTREE
02438      * @param[in] vid   Volume on which to create the index.
02439      * @param[in] ntype   Type of index. Legitimate values are: 
02440      *  - t_rtree : R*-Tree 
02441      * @param[in] property Logging level of store. Legitimate values are:
02442      *  - t_temporary
02443      *  - t_regular
02444      *  - t_load_file
02445      *  - t_insert_file
02446      *  See sm_store_property_t for details.
02447      * @param[in] dim Number of dimensions of the key.
02448      * The key type is an nbox_t.
02449      * See \ref nbox_t for details. 
02450      * @param[out] stid New store ID will be returned here.
02451      */
02452     static rc_t            create_md_index(
02453         vid_t                   vid, 
02454         ndx_t                   ntype, 
02455         store_property_t        property,
02456         stid_t&                 stid, 
02457         int2_t                  dim = 2
02458     );
02459 
02460     /**\brief Destroy an R*-Tree index.
02461      * \ingroup SSMRTREE
02462      *
02463      * @param[in] iid  ID of the index to be destroyed.
02464      */
02465     static rc_t            destroy_md_index(const stid_t& iid);
02466 
02467     /**\brief Bulk-load a multi-dimensional index from multiple sources.
02468      * \ingroup SSMBULKLD
02469      * @param[in] stid  ID of the index to be loaded.
02470      * @param[in] nsrcs  Number of files used for data sources.
02471      * @param[in] source  Array of IDs of files used for data sources.
02472      * @param[out] stats  Statistics concerning the load activity will be
02473      *                     written here.
02474      * @param[in] hff   Heuristic fill factor. Not used.
02475      * @param[in] hef   Heuristic expansion factor. Not used.
02476      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02477     */
02478     static rc_t            bulkld_md_index(
02479         const stid_t&             stid, 
02480         int                       nsrcs,
02481         const stid_t*             source, 
02482         sm_du_stats_t&            stats,
02483         int2_t                    hff=75,
02484         int2_t                    hef=120,
02485         nbox_t*                   universe=NULL);
02486 
02487     /**\brief Bulk-load a multi-dimensional index from a single source.
02488      * The storage manager does not provide
02489      * complete support for non-unique multidimensional indexes.
02490      * While you may insert multiple (distinct) entries for the same key in 
02491      * a multi-dimensional index, you will not be able to use them; only
02492      * the first can be retrieved.  
02493      * \ingroup SSMBULKLD
02494      * @param[in] stid  ID of the index to be loaded.
02495      * @param[in] source  ID of file to be used for data source.
02496      * @param[out] stats  Statistics concerning the load activity will be
02497      *                     written here.
02498      * @param[in] hff   Heuristic fill factor. Not used.
02499      * @param[in] hef   Heuristic expansion factor. Not used.
02500      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02501     */
02502     static rc_t            bulkld_md_index(
02503         const stid_t&             stid, 
02504         const stid_t&             source, 
02505         sm_du_stats_t&            stats,
02506         int2_t                    hff=75,
02507         int2_t                    hef=120,
02508         nbox_t*                   universe=NULL);
02509 
02510     /**\brief Bulk-load a multi-dimensional index from a sorted stream source.
02511      * The storage manager does not provide
02512      * complete support for non-unique multidimensional indexes.
02513      * While you may insert multiple (distinct) entries for the same key in 
02514      * a multi-dimensional index, you will not be able to use them; only
02515      * the first can be retrieved.  
02516      * \ingroup SSMBULKLD
02517      * @param[in] stid  ID of the index to be loaded.
02518      * @param[in] sorted_stream  Input stream that is data source.
02519      * @param[out] stats  Statistics concerning the load activity will be
02520      *                     written here.
02521      * @param[in] hff   Heuristic fill factor. Not used.
02522      * @param[in] hef   Heuristic expansion factor. Not used.
02523      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02524     */
02525     static rc_t            bulkld_md_index(
02526         const stid_t&             stid, 
02527         sort_stream_i&            sorted_stream,
02528         sm_du_stats_t&            stats,
02529         int2_t                    hff=75,
02530         int2_t                    hef=120,
02531         nbox_t*                   universe=NULL);
02532 
02533     /**\brief Print a representation of the rtree.
02534      * \ingroup SSMRTREE
02535      * @param[in] stid  ID of the index to be printed.
02536      * @param[in] out   I/O stream to which to write the output.
02537     */
02538     static rc_t            print_md_index(stid_t stid, ostream &out);
02539 
02540     /**\brief Look up an entry in a multi-dimensional index.
02541      * \ingroup SSMRTREE
02542      *
02543      * @param[in] stid  ID of the index. 
02544      * @param[in] key   Key associated with the entry to look up.
02545      * @param[out] el   Element associated with the given key will be copied into this buffer.
02546      * @param[in,out] elen On input, length of the buffer into which the 
02547      *                  result will be written. If too small, eRECWONTFIT will
02548      *                  be returned.
02549      *                 On output, the length of the result.
02550      * @param[out] found   True if an entry is found.
02551      *
02552      * If the index is not unique (allows duplicates), the first
02553      * element found with the given key will be returned.
02554      *
02555      * The storage manager does not provide a method to locate all 
02556      * entries associated with a non-unique key.
02557      */
02558     static rc_t            find_md_assoc(
02559         stid_t                    stid, 
02560         const nbox_t&             key, 
02561         void*                     el, 
02562         smsize_t&                 elen, 
02563         bool&                     found);
02564 
02565     /**\brief Create an entry in a multi-dimensional index.
02566      * The storage manager does not provide
02567      * complete support for non-unique multidimensional indexes.
02568      * While you may insert multiple (distinct) entries for the same key in 
02569      * a multi-dimensional index, you will not be able to use them; only
02570      * the first can be retrieved.  
02571      * \ingroup SSMRTREE
02572      *
02573      * @param[in] stid  ID of the index. 
02574      * @param[in] key  Key for the association to be created.
02575      * @param[in] el  Element for the association to be created.
02576     */
02577     static rc_t            create_md_assoc(
02578         stid_t                    stid, 
02579         const nbox_t&             key,
02580         const vec_t&              el);
02581 
02582     /**\brief Destroy an entry in a multi-dimensional index.
02583      * \ingroup SSMRTREE
02584      *
02585      * @param[in] stid  ID of the index. 
02586      * @param[in] key   Key of the entry to be removed.
02587      * @param[in] el   Element (value) of the entry to be removed.
02588     */
02589     static rc_t            destroy_md_assoc(
02590         stid_t                    stid, 
02591         const nbox_t&             key,
02592         const vec_t&              el);
02593 
02594     /**\cond skip */
02595     // for debugging
02596     static rc_t            draw_rtree(const stid_t& stid, ostream &);
02597     /**\endcond skip */
02598 
02599     /**\brief Gather usage statistics about an R*-Tree index.
02600      * \ingroup SSMRTREE
02601      * @param[in] stid  ID of the index. 
02602      * @param[out] stat  Usage statistics will be written here.
02603      * @param[in] size  Number of uint2_t's in the array ovp.
02604      * @param[out] ovp   Pre-allocated array of integers into which
02605      * the method will write the overlap percentages for each level of the
02606      * tree.
02607      * @param[in] audit If "true", the method 
02608      * will check assertions about the
02609      * correctness of the rtree.
02610      * If the audit fails an internal fatal error is generated 
02611      * to facilitate debugging. (It will generate a core file if your
02612      * shell permits such.)
02613      *
02614      * \note for debugging
02615     */
02616     static rc_t            rtree_stats(
02617         const stid_t&             stid,
02618         rtree_stats_t&            stat,
02619         uint2_t                   size = 0,
02620         uint2_t*                  ovp = NULL,
02621         bool                      audit = false);
02622 
02623     /**\addtogroup SSMFILE 
02624      * You can create, destroy, and scan files of records. You may exert some
02625      * control over the order in which records appear in the file (a physical
02626      * scan), but, in general, the storage manager decides where to put records.
02627      *
02628      * Pages in a file are slotted pages: Each page contains an array of
02629      * slots.
02630      * Records take one of three forms: small, large, and very large.
02631      * - Small records fit in the slots on the file pages.
02632      * - Large records are too big to fit on a slotted page, so they are put
02633      * elsewhere, and the slots point to these records.  Actually, what is
02634      * in a slot is a small array of page pointers to the data of the large record.
02635      * - A very large record is one whose slot in the file page contains
02636      *   a single reference to a page that is an index of data pages.
02637      *
02638      * Because records may take these forms, the API for creating records
02639      * contains the opportunity for you to provide a hint about the ultimate
02640      * size of the record so that the storage manager can create the proper
02641      * structure for the record immediately, rather than creating a small
02642      * record that is soon to be converted to a large, then a very large record
02643      * by subsequent appends. 
02644      *
02645      * All records contain a client-defined header.  This is for the convenience
02646      * of server-writers.  The header must fit on the slotted page, so it should
02647      * never be very large.
02648      *
02649      * The following methods manipulate files of records and the records found 
02650      * there.
02651      *
02652      * Modules below describe file traversal and
02653      * appending to files (\ref SSMSCANF), 
02654      * and pinning individual records in the buffer pool for extended operations 
02655      * (\ref SSMPIN).
02656      *
02657      * \section UNINIT Uninitialized Data
02658      * The functions create_rec, append_rec, and update_rec can be used to
02659      * write blocks of data that are all zeroes,  with minimal logging. 
02660      * This is useful for creating records of known size but with uninitialized data.  
02661      * The type zvec_t, a special case of vec_t, is for this purpose. 
02662      * Construct it with only a size, as follows:
02663      * \code
02664      * zvec_t zdata(100000);
02665      * \endcode
02666      * The underlying logging code recognizes that this is a vector of zeroes and
02667      * logs only a count, not the data themselves. 
02668      *
02669      * \section Errors
02670      * If an error occurs in the middle of one of these methods that is updating persistent data,
02671      * the record or file \e could be in an inconsistent state. 
02672      * The caller has the choice of aborting the transaction or rolling back to the nearest savepoint (see \ref SSMXCT).
02673      *
02674      * \sa SSMSCAN, SSMPIN, vec_t, zvec_t, IDs.
02675      */
02676     
02677     /**\brief Create a file of records.
02678      * \ingroup SSMFILE
02679      * \details
02680      * @param[in] vid   Volume on which to create a file.
02681      * @param[out] fid  Returns (store) ID of the new file here.
02682      * @param[in] property Give the file this property.
02683      * @param[in] cluster_hint Not used (defaults to 0).
02684      *
02685      * The cluster hint is included in the API for future use. 
02686      * It has no effect.
02687      */
02688     static rc_t            create_file( 
02689         vid_t                   vid, 
02690         stid_t&                 fid,
02691         store_property_t        property,
02692         shpid_t                 cluster_hint = 0
02693     ); 
02694 
02695     /**\brief Destroy a file of records.
02696      * \ingroup SSMFILE
02697      * \details
02698      * @param[in] fid  ID of the file to destroy.
02699      */
02700     static rc_t            destroy_file(const stid_t& fid); 
02701 
02702     /**\brief Create a new record.
02703      * \ingroup SSMFILE
02704      * \details
02705      * @param[in] fid  ID of the file in which to create a record.
02706      * @param[in] hdr  What to put in the record's header.
02707      * @param[in] len_hint  Hint about how big the record will ultimately be.
02708      * This is used to determine the initial format of the record. If you plan
02709      * to append to the record and know that it will ultimately become a large
02710      * record, it is more efficient to give a size hint that is larger than
02711      * a page here. Otherwise, the record will be made small (as determined by
02712      * the size of the parameter \a data ), and subsequent appends will cause 
02713      * the record to be converted to a large record.
02714      * @param[in] data  What to put in the record's body. 
02715      * @param[out] new_rid  ID of the newly created record.
02716      * @param[in] policy  File compaction policy to use. See \ref pg_policy_t
02717      * for possible values. Defaults to t_cache | t_compact | t_append.
02718      * \note When compiled with SM_DORA, an extra argument \a bIgnoreLocks (default false) precedes \a policy, so a positional sixth argument binds to \a bIgnoreLocks rather than \a policy. */
02719     static rc_t            create_rec(
02720         const stid_t&            fid, 
02721         const vec_t&             hdr, 
02722         smsize_t                 len_hint, 
02723         const vec_t&             data, 
02724         rid_t&                   new_rid,
02725 #ifdef SM_DORA
02726         const bool               bIgnoreLocks = false,
02727 #endif
02728         uint4_t                  policy = t_cache | t_compact | t_append
02729     ); 
02730 
02731     /**\brief Destroy a record.
02732      * \ingroup SSMFILE
02733      * \details
02734      * @param[in] rid  ID of the record to destroy.
02735      * \note When compiled with SM_DORA, this takes an additional trailing argument \a bIgnoreLocks (default false). */
02736     static rc_t            destroy_rec(const rid_t& rid
02737 #ifdef SM_DORA
02738         , const bool             bIgnoreLocks = false
02739 #endif
02740                                        );
02741 
02742     /**\brief Modify the body of an existing record.
02743      * \ingroup SSMFILE
02744      * \details
02745      * @param[in] rid  ID of the record to modify.
02746      * @param[in] start  Offset of the first byte to change.
02747      * @param[in] data  What to put in the record's body.  
02748      *
02749      * This overwrites
02750      * the existing bytes, starting at the offset \a start through the
02751      * byte at \a start + \a data.size() - 1 (that is, \a data.size() bytes in all).
02752      * This method \b cannot \b be \b used to change the size of a record.
02753      * Attempting this will result in an error.
02754      */
02755     static rc_t            update_rec(
02756         const rid_t&             rid, 
02757         smsize_t                 start, 
02758         const vec_t&             data);
02759 
02760     /**\brief Modify the header of an existing record.
02761      * \ingroup SSMFILE
02762      * \details
02763      * @param[in] rid  ID of the record to modify.
02764      * @param[in] start  Offset of the first header byte to change.
02765      * @param[in] hdr  What to put in the record's header.  
02766      *
02767      * This overwrites
02768      * the existing header bytes, starting at the offset \a start through the
02769      * byte at \a start + \a hdr.size() - 1 (that is, \a hdr.size() bytes in all).
02770      * This method \b cannot \b be \b used to change the size of a record
02771      * header. There are no methods for appending to or truncating a
02772      * record header.
02773      *
02774      * \sa pin_i::update_rec, \ref SSMPIN
02775      */
02776     static rc_t            update_rec_hdr(
02777         const rid_t&             rid, 
02778         smsize_t                 start, 
02779         const vec_t&             hdr);
02780     // see also pin_i::update_rec*()
02781 
02782     /**\brief Append bytes to a record body.
02783      * \ingroup SSMFILE
02784      * \details
02785      * @param[in] rid  ID of the record to modify.
02786      * @param[in] data  What to append to the record.
02787      *
02788      * \note This appends \b to a record; it does \b not append a record to a file!
02789      * \sa pin_i::append_rec, \ref SSMPIN
02790      */
02791     static rc_t            append_rec(
02792         const rid_t&             rid, 
02793         const vec_t&             data
02794                 );
02795 
02796     /**\brief Chop bytes off the end of a record body.
02797      * \ingroup SSMFILE
02798      * \details
02799      * @param[in] rid  ID of the record to modify.
02800      * @param[in] amount  How many bytes to lop off.
02801      *
02802      * \sa pin_i::truncate_rec, \ref SSMPIN
02803      */
02804     static rc_t            truncate_rec(
02805         const rid_t&             rid, 
02806         smsize_t                 amount
02807     );
02808 
02809     /**\brief Chop bytes off the end of a record body, reporting whether the record went from large to small.
02810      * \ingroup SSMFILE
02811      * \details
02812      * @param[in] rid  ID of the record to modify.
02813      * @param[in] amount  How many bytes to lop off.
02814      * @param[out] should_forward  Returns true if the record started out
02815      * large but is now small as a result of the truncation.  
02816      * This enables a value-added server to take action in this event,
02817      * should it so desire.
02818      *
02819      * \sa pin_i::truncate_rec, \ref SSMPIN
02820      */
02821     static rc_t            truncate_rec(
02822         const rid_t&             rid, 
02823         smsize_t                 amount,
02824         bool&                    should_forward 
02825     );
02826 
02827 #ifdef OLDSORT_COMPATIBILITY
02828     typedef ssm_sort::key_info_t key_info_t;
02829 
02830     /* old sort physical version */
02831     /**\brief Sort a file. Deprecated.
02832      * \details
02833      */
02834     static rc_t            sort_file(
02835         const stid_t&             fid, 
02836         vid_t                     vid, 
02837         stid_t&                   sfid, 
02838         store_property_t          property,
02839         const key_info_t&         key_info, 
02840         int                       run_size,
02841         bool                      ascending = true,
02842         bool                      unique = false,
02843         bool                      destructive = false,
02844         bool                      use_new_sort = true);
02845 
02846     /**\brief Sort a file. Deprecated.
02847      * \details
02848      */
02849     static rc_t            new_sort_file(
02850         const stid_t&             fid, 
02851         vid_t                     vid, 
02852         stid_t&                   sfid, 
02853         store_property_t          property,
02854         const key_info_t&         key_info, 
02855         int                       run_size,
02856         bool                      ascending = true,
02857         bool                      unique = false,
02858         bool                      destructive = false
02859         );
02860 #endif /* OLDSORT_COMPATIBILITY */
02861 
02862     typedef ssm_sort::sort_keys_t sort_keys_t;
02863 
02864     /* new sort physical version : see notes below */
02865     /**\brief Sort a file.
02866      * \ingroup SSMSORT
02867      * @param[in] fid File to sort.
02868      * @param[in] sorted_fid File to which to write the results. 
02869      * @param[in] nvids Size of array \a vid.
02870      * @param[in] vid Array of IDs of the volumes on which to place scratch (temporary) files.
02871      * @param[in] kl See sort_keys_t.
02872      * @param[in] min_rec_sz Hint of minimum record size in input file.
02873      * @param[in] run_size Number of pages in buffer pool to use for a run. 
02874      * @param[in] temp_space Number of pages to use for scratch space.
02875      * (This limits the amount of memory used by the sort).
02876      *
02877      * \details
02878      * Before you call sort_file, you must create an output file \a sorted_fid
02879      * into which sort_file will write the results.
02880      *
02881      * The sort uses temporary files when the input file contains more records
02882      * than can fit in one run (determined by \a run_size). These temporary files
02883      * may be spread across multiple volumes, which is useful if the
02884      * volumes reside on different spindles.  The arguments \a nvids
02885      * and \a vid are for indicating the volumes to use for these scratch
02886      * files.
02887      *
02888      * The caller can provide a clue in \a min_rec_sz
02889      * about the minimum record size of the
02890      * input file, which can help the sort's efficiency.
02891      *
02892      * The \a run_size indicates how many buffer-pool pages to use
02893      * for each run.
02894      * Since at all times one page is fixed for output, while the rest are 
02895      * for reading the input in runs, the real run size is \a run_size-1.
02896      * 
02897      */
02898     static rc_t            sort_file(
02899         const stid_t&            fid,     // input file
02900         const stid_t&            sorted_fid, // output file -- created by caller; can be same as input file
02901         int                      nvids,    // array size for vids
02902         const vid_t*             vid,     // array of vids for temp
02903                         // files
02904                         // (the volumes on which scratch
02905                         // files are placed)
02906         sort_keys_t&            kl, // key location info
02907         smsize_t                min_rec_sz, // for estimating space use
02908         int                     run_size,   // # pages to use for a run
02909         int                     temp_space // # pages VM to use for scratch 
02910     );
02911 
02912     /**\brief Return the short volume ID of a volume.
02913      * \ingroup SSMVOL
02914      *
02915      * @param[in] lvid Long (persistent) volume ID found on the volume's
02916      * header.
02917      * @param[out] vid Short volume ID of a mounted volume.
02918      */
02919     static rc_t            lvid_to_vid(
02920         const lvid_t&          lvid,
02921         vid_t&                 vid);
02922 
02923     /**\brief Return the long volume ID of a volume.
02924      * \ingroup SSMVOL
02925      *
02926      * @param[in] vid Short volume ID of a mounted volume.
02927      * @param[out] lvid Long (persistent) volume ID found on the volume's
02928      * header.
02929      */
02930     static rc_t            vid_to_lvid(
02931         vid_t                  vid,
02932         lvid_t&                lvid);
02933 
02934     /*****************************************************************
02935      * Locking related functions
02936      *
02937      * NOTE: there are standard conversions from lpid_t, rid_t, and
02938      *       stid_t to lockid_t, so wherever a lockid_t parameter is
02939      *         specified a lpid_t, rid_t, or stid_t can be used.
02940      *
02941      *****************************************************************/
02942 
02943 #ifdef SLI_HOOKS
02944     /* enable/disable SLI globally for all threads created after this
02945        point. Does *NOT* disable SLI for existing threads.
02946      */
02947     static void            set_sli_enabled(bool enabled);
02948     static void            set_elr_enabled(bool enabled);
02949 
02950     static rc_t            set_log_features(char const* features);
02951     static char const*         get_log_features();
02952 #endif
02953 
02954     /**\brief Acquire a lock.
02955      * \ingroup SSMLOCK
02956      * @param[in]  n  Lock id of the entity to lock. There are
02957      * conversions from record ids, volume ids, store ids, and page ids to
02958      * lockid_t.
02959      * @param[in]  m  Desired lock mode.  Values: EX, SH.
02960      * @param[in]  d  Desired duration (default t_long).  Values: 
02961      * - t_very_long : Held across transaction boundaries; 
02962      *             cannot be released by unlock()
02963      * - t_long : Released at commit; cannot be released by unlock()
02964      * - t_medium : May be released early by explicit unlock()
02965      * - t_short  : May be released early by explicit unlock()
02966      * - t_instant : Not held: acquired and released immediately.  Useful
02967      *             to see if any other transaction holds an incompatible lock.
02968      * @param[in]  timeout  Milliseconds willing to block (default WAIT_SPECIFIED_BY_XCT).  See timeout_in_ms.
02969      *
02970      * The lock manager is written with these durations in mind, but the
02971      * only durations used by the storage manager are t_instant and t_long.
02972      * Medium-duration locks are used internally in one place.  
02973      *
02974      * Durations other than long and instant are not well-tested.
02975      */
02976     static rc_t            lock(
02977         const lockid_t&         n, 
02978         lock_mode_t             m,
02979         lock_duration_t         d = t_long,
02980         timeout_in_ms           timeout = WAIT_SPECIFIED_BY_XCT
02981     );
02982     
02983     /**\brief Release a lock.
02984      * \ingroup SSMLOCK
02985      * @param[in]  n  Lock id of the entity to unlock. There are
02986      * conversions from record ids, volume ids, store ids, and page ids to
02987      * lockid_t.
02988      * \sa lock() for the lock durations that can and cannot be released by unlock(). */
02989     static rc_t            unlock(const lockid_t& n);
02990 
02991     /**\brief  Disable lock escalation on the given entity. 
02992      * \ingroup SSMLOCK
02993      * @param[in]  n  Lock id of the entity on which to disable escalation. There are
02994      * conversions from record ids, volume ids, store ids, and page ids to
02995      * lockid_t.
02996      * @param[in]  passOnToDescendants If true (the default), apply this to the descendants
02997      * of \a n.
02998      */
02999     static rc_t            dont_escalate(
03000         const lockid_t&           n,
03001         bool                      passOnToDescendants = true
03002     );
03003 
03004     /**\brief  Find the storage-manager-wide escalation thresholds.
03005      * \ingroup SSMLOCK
03006      * @param[out] toPage    Receives the escalate-to-page threshold.
03007      * @param[out] toStore   Receives the escalate-to-store threshold.
03008      * @param[out] toVolume  Receives the escalate-to-volume threshold.
03009      * \details Default values (used for all transactions until they change their per-transaction thresholds) are determined by the storage-manager-wide options. See \ref SSMOPT.
03010      */
03011     static rc_t            get_escalation_thresholds(
03012         w_base_t::int4_t&        toPage,
03013         w_base_t::int4_t&        toStore,
03014         w_base_t::int4_t&        toVolume);
03015 
03016     /**\brief  Change the storage-manager-wide escalation thresholds.
03017      * \ingroup SSMLOCK
03018      * @param[in] toPage    New escalate-to-page threshold.
03019      * @param[in] toStore   New escalate-to-store threshold.
03020      * @param[in] toVolume  New escalate-to-volume threshold.
03021      * \details Default values (used for all transactions until they change their per-transaction thresholds) are determined by the storage-manager-wide options. See \ref SSMOPT.
03022      */
03023     static rc_t            set_escalation_thresholds(
03024         w_base_t::int4_t       toPage,
03025         w_base_t::int4_t       toStore,
03026         w_base_t::int4_t       toVolume);
03027 
03028     /**\brief  Find out if the attached transaction has an entity locked.
03029      * \ingroup SSMLOCK
03030      * @param[in]  n  Lock id of the entity to query. There are
03031      * conversions from record ids, volume ids, store ids, and page ids to
03032      * lockid_t.
03033      * @param[out]  m  Mode of lock held. NL if none.
03034      * @param[in]  implicit If "true" the query will return a lock mode if
03035      * an implicit lock is held, otherwise the lock must be held explicitly. Defaults to false.
03036      */
03037     static rc_t            query_lock(
03038         const lockid_t&        n, 
03039         lock_mode_t&           m,
03040         bool                   implicit = false
03041     );
03042 
03043     /*****************************************************************
03044      * Lock Cache related functions
03045      *
03046      * Each transaction has a cache of recently acquired locks.
03047      * The following functions control the use of the cache.
03048      * Note that the functions affect the transaction currently
03049      * associated with the thread.
03050      *****************************************************************/
03051     // Turn the lock cache on (enable=true) or off (enable=false).
03052     // NOTE(review): this comment used to say "return previous state", but the declaration returns only rc_t -- confirm against the implementation.
03053     /**\brief Control lock caching for attached transaction.
03054      * \ingroup SSMLOCK
03055      *
03056      * @param[in] enable Set to true if you want to turn on lock caching
03057      * for the attached transaction.  The default is that it is turned on.
03058      *
03059      * Only long-duration locks are cached.
03060      * Lock caching can be turned off by default using the 
03061      * sm_lock_caching option.  Even with it turned off by default, it
03062      * can be turned on for a given transaction with this method.
03063      *
03064      */
03065     static rc_t            set_lock_cache_enable(bool enable);
03066 
03067     /**\brief True if lock cache is enabled for the attached transaction 
03068      * \ingroup SSMLOCK
03069      *
03070      * @param[out] enabled Will be set to true if the attached transaction has
03071      * lock caching enabled, false otherwise.
03072      */
03073     static rc_t            lock_cache_enabled(bool& enabled);
03074 
03075 private:
03076 
03077     static int _instance_cnt;
03078     static option_group_t* _options;
03079     static option_t* _hugetlbfs_path;
03080     static option_t* _reformat_log;
03081     static option_t* _prefetch;
03082     static option_t* _bufpoolsize;
03083     static option_t* _locktablesize;
03084     static option_t* _logdir;
03085     static option_t* _logsize;
03086     static option_t* _logbufsize;
03087     static option_t* _error_log;
03088     static option_t* _error_loglevel;
03089     static option_t* _lockEscalateToPageThreshold;
03090     static option_t* _lockEscalateToStoreThreshold;
03091     static option_t* _lockEscalateToVolumeThreshold;
03092     static option_t* _cc_alg_option;
03093     static option_t* _log_warn_percent;
03094     static option_t* _num_page_writers;
03095     static option_t* _logging;
03096     static option_t* _lock_caching_default;
03097 
03098 
03099     static rc_t            _set_option_logsize(
03100         option_t*              opt,
03101         const char*            value,
03102         ostream*               err_stream);
03103     
03104     static rc_t            _set_option_lock_escalate_to_page(
03105         option_t*              opt,
03106         const char*            value,
03107         ostream*               err_stream);
03108     
03109     static rc_t            _set_option_lock_escalate_to_store(
03110         option_t*              opt,
03111         const char*            value,
03112         ostream*               err_stream);
03113     
03114     static rc_t            _set_option_lock_escalate_to_volume(
03115         option_t*              opt,
03116         const char*            value,
03117         ostream*               err_stream);
03118     
03119     static rc_t            _set_store_property(
03120         stid_t                stid,
03121         store_property_t      property);
03122 
03123     static rc_t            _get_store_property(
03124         stid_t                stid,
03125         store_property_t&     property);
03126 
03127     static rc_t         _begin_xct(
03128         sm_stats_info_t*      stats,  // allocated by caller
03129         tid_t&                tid, 
03130         timeout_in_ms         timeout);
03131 
03132     static rc_t            _commit_xct(
03133         sm_stats_info_t*&     stats,
03134         bool                  lazy,
03135         lsn_t* plastlsn);
03136 
03137     static rc_t            _commit_xct_group(
03138         xct_t *               list[],
03139         int                   listlen);
03140 
03141     static rc_t            _prepare_xct(
03142         sm_stats_info_t*&     stats,
03143         vote_t&                v);
03144 
03145     static rc_t            _set_coordinator(const server_handle_t &); 
03146     
03147     static rc_t            _enter_2pc(const gtid_t &); 
03148     static rc_t            _force_vote_readonly(); 
03149     static rc_t            _recover_2pc(const gtid_t &,// in
03150                                 bool    mayblock,
03151                                 tid_t    &    //out -- attached if found(?)
03152                             );
03153     static rc_t            _chain_xct(
03154         sm_stats_info_t*&      stats,
03155         bool                   lazy);
03156 
03157     static rc_t            _abort_xct(
03158         sm_stats_info_t*&      stats);
03159 
03160     static rc_t            _save_work(sm_save_point_t& sp);
03161 
03162     static rc_t            _rollback_work(const sm_save_point_t&        sp);
03163     static rc_t            _mount_dev(
03164         const char*            device,
03165         u_int&                 vol_cnt,
03166         vid_t                  local_vid);
03167 
03168     static rc_t            _dismount_dev(
03169         const char*            device,
03170         bool                   dismount_if_locked = true
03171     );
03172     static rc_t            _create_vol(
03173         const char*            device_name,
03174         const lvid_t&          lvid,
03175         smksize_t              quota_KB,
03176         bool                   skip_raw_init,
03177         const bool             apply_fake_io_latency,
03178         const int              fake_disk_latency);
03179 
03180     static rc_t            _create_index(
03181         vid_t                 vid, 
03182         ndx_t                 ntype, 
03183         store_property_t      property,
03184         const char*           key_desc,
03185         concurrency_t         cc,
03186         stid_t&               stid
03187     );
03188 
03189     static rc_t            _destroy_index(const stid_t& iid); 
03190 
03191     static rc_t            _get_store_info( 
03192         const stid_t  &       stid, 
03193         sm_store_info_t&      info);
03194 
03195     static rc_t            _bulkld_index(
03196         const stid_t&         stid,
03197         int                   nsrcs,
03198         const stid_t*         source,
03199         sm_du_stats_t&        stats,
03200         bool                  sort_duplicates = true,
03201         bool                  lexify_keys = true
03202     );
03203 
03204     static rc_t            _bulkld_index(
03205         const stid_t&          stid, 
03206         sort_stream_i&         sorted_stream,
03207         sm_du_stats_t&         stats
03208     );
03209 
03210     static rc_t            _print_index(const stid_t &iid);
03211 
03212     static rc_t            _create_assoc(
03213         const stid_t  &        stid, 
03214         const vec_t&           key, 
03215         const vec_t&           el
03216 #ifdef SM_DORA
03217         , const bool             bIgnoreLocks = false
03218 #endif
03219     );
03220 
03221     static rc_t            _destroy_assoc(
03222         const stid_t &        stid, 
03223         const vec_t&          key,
03224         const vec_t&          el
03225 #ifdef SM_DORA
03226         , const bool             bIgnoreLocks = false
03227 #endif
03228     );
03229 
03230     static rc_t            _destroy_all_assoc(
03231         const stid_t&        stid, 
03232         const vec_t&         key,
03233         int&                 num_removed
03234     );
03235     static rc_t            _find_assoc(
03236         const stid_t&        stid, 
03237         const vec_t&         key, 
03238         void*                el, 
03239         smsize_t&            elen, 
03240         bool&                found
03241 #ifdef SM_DORA
03242         , const bool             bIgnoreLocks = false
03243 #endif
03244     );
03245 
03246     // below method overloaded for rtree
03247     static rc_t            _create_md_index(
03248         vid_t                 vid, 
03249         ndx_t                 ntype, 
03250         store_property_t      property,
03251         stid_t&               stid, 
03252         int2_t                dim=2
03253     );
03254 
03255     static rc_t            _destroy_md_index(const stid_t& iid);
03256 
03257     static rc_t            _destroy_md_assoc(
03258         stid_t                stid,
03259         const nbox_t&         key,
03260         const vec_t&          el);
03261 
03262     static rc_t            _bulkld_md_index(
03263         const stid_t&         stid, 
03264         int                   nsrcs,
03265         const stid_t*         source, 
03266         sm_du_stats_t&        stats,
03267         int2_t                hff,           // for rtree only
03268         int2_t                hef,           // for rtree only
03269         nbox_t*               universe);// for rtree only
03270 
03271     static rc_t            _bulkld_md_index(
03272         const stid_t&         stid, 
03273         sort_stream_i&        sorted_stream,
03274         sm_du_stats_t&        stats,
03275         int2_t                hff,           // for rtree only
03276         int2_t                hef,           // for rtree only
03277         nbox_t*               universe);// for rtree only
03278 
03279     static rc_t            _print_md_index(stid_t stid, ostream &);
03280 
03281     static rc_t            _create_md_assoc(
03282         stid_t                stid, 
03283         const nbox_t&         key,
03284         const vec_t&          el);
03285 
03286     static rc_t            _find_md_assoc(
03287         stid_t                stid, 
03288         const nbox_t&         key, 
03289         void*                 el, 
03290         smsize_t&             elen, 
03291         bool&                 found);
03292 
03293     //
03294     // The following functions deal with files of records.
03295     //
03296     static rc_t            _destroy_n_swap_file(
03297         const stid_t&         old_fid,
03298         const stid_t&         new_fid);
03299 
03300     static rc_t            _create_file(
03301         vid_t                 vid, 
03302         stid_t&               fid,
03303         store_property_t     property,
03304         shpid_t              cluster_hint = 0
03305     ); 
03306 
03307     static rc_t            _destroy_file(const stid_t& fid); 
03308 
03309     static rc_t            _create_rec(
03310         const stid_t&            fid, 
03311         const vec_t&             hdr, 
03312         smsize_t                 len_hint, 
03313         const vec_t&             data, 
03314         rid_t&                   new_rid,
03315         uint4_t                  policy 
03316 #ifdef SM_DORA
03317         , const bool             bIgnoreLocks = false
03318 #endif
03319         ); 
03320 
03321     static rc_t            _destroy_rec(
03322         const rid_t&             rid
03323 #ifdef SM_DORA
03324         , const bool             bIgnoreLocks = false
03325 #endif
03326         );
03327 
03328     static rc_t            _update_rec(
03329         const rid_t&             rid, 
03330         smsize_t                 start, 
03331         const vec_t&             data
03332 #ifdef SM_DORA
03333         , const bool             bIgnoreLocks = false
03334 #endif
03335         );
03336 
03337     static rc_t            _update_rec_hdr(
03338         const rid_t&             rid, 
03339         smsize_t                 start, 
03340         const vec_t&             hdr
03341 #ifdef SM_DORA
03342         , const bool             bIgnoreLocks = false
03343 #endif
03344         );
03345 
03346     static rc_t            _append_rec(
03347         const rid_t&             rid, 
03348         const vec_t&             data
03349         );
03350 
03351     static rc_t            _truncate_rec(
03352             const rid_t&         rid, 
03353             smsize_t             amount,
03354             bool&                should_forward
03355         );
03356 
03357     static rc_t            _draw_rtree(const stid_t& stid, ostream &);
03358 
03359     static rc_t            _rtree_stats(
03360             const stid_t&       stid,
03361             rtree_stats_t&      stat,
03362             uint2_t             size,
03363             uint2_t*            ovp,
03364             bool                audit
03365         );
03366 
03367 #ifdef OLDSORT_COMPATIBILITY
03368     /* old sort internal, physical */
03369     static rc_t            _sort_file(
03370         const stid_t&           fid, 
03371         vid_t                   vid, 
03372         stid_t&                 sfid, 
03373         store_property_t        property,
03374         const key_info_t&       key_info, 
03375         int                     run_size,
03376         bool                    ascending,
03377         bool                    unique,
03378         bool                    destructive
03379     );
03380 #endif /* OLDSORT_COMPATIBILITY */
03381 
03382     /* new sort internal, physical; same parameter list as the public sort_file() */
03383     static rc_t            _sort_file(
03384         const stid_t&             fid,     // input file
03385         const stid_t&             sorted_fid, // output file -- 
03386                         // created by caller--
03387                         // can be same as input file
03388         int                      nvids,    // array size for vids
03389         const vid_t*             vid,     // array of vids for temp files
03390         sort_keys_t&             kl,     // key location info
03391         smsize_t                 min_rec_sz, // for estimating space use
03392         int                      run_size,   // # pages to use for a run
03393         int                      temp_space //# pages VM to use for scratch 
03394     );
03395 
03396 
03397 #ifdef OLDSORT_COMPATIBILITY
03398     /* internal compatibility old sort-> new sort */
03399     static rc_t            _new_sort_file(
03400             const stid_t&         in_fid, 
03401             const stid_t&         out_fid, 
03402             const key_info_t&    ki, 
03403             int                  run_size,
03404             bool                  ascending, 
03405             bool                  unique, 
03406             bool                  keep_orig //!destructive
03407             ); 
03408 #endif /* OLDSORT_COMPATIBILITY */
03409 
03410     static store_flag_t     _make_store_flag(store_property_t property);
03411     // reverse function:
03412     // static store_property_t    _make_store_property(w_base_t::uint4_t flag);
03413     // is in dir_vol_m
03414 
03415     // this is for df statistics  DU DF
03416     static rc_t            _get_du_statistics(
03417         vid_t                  vid, 
03418         sm_du_stats_t&         du,
03419         bool                   audit);
03420 
03421     static rc_t            _get_du_statistics(
03422         const stid_t  &        stid, 
03423         sm_du_stats_t&         du,
03424         bool                   audit);
03425 
03426     static rc_t            _get_volume_meta_stats(
03427         vid_t                  vid,           // volume id to examine
03428         SmVolumeMetaStats&     volume_stats,  // non-const ref: presumably filled in by the callee
03429         concurrency_t          cc);           // concurrency mode requested by the caller
03430 
03431     static rc_t            _get_file_meta_stats(
03432         vid_t                  vid,              // volume id to examine
03433         w_base_t::uint4_t      num_files,        // presumably # entries in file_stats -- confirm
03434         SmFileMetaStats*       file_stats,       // caller-supplied pointer; assumed to be an array of num_files
03435         bool                   batch_calculate,  // NOTE(review): meaning not visible here -- confirm
03436         concurrency_t          cc);              // concurrency mode requested by the caller
03437 };
03438 
03439 /**\brief Information about a store that can be queried by the client.
03440  * \details
03441  * This information is stored in a store directory on the volume.
03442  * It can be queried with ss_m::get_store_info.
03443  */
03444 class sm_store_info_t {
03445 public:
          /**\brief Create an empty descriptor that owns a key-description buffer.
           * \param[in] len  Size in bytes of the buffer allocated for keydescr;
           *                 must be non-negative.
           * \details
           * All fields start out as zero or as the corresponding "bad" enum
           * value.  The buffer is zero-filled, so keydescr is an empty
           * string until something writes into it.
           */
03446     NORET sm_store_info_t(int len) :
03447                 store(0), stype(ss_m::t_bad_store_t), 
03448                 ntype(ss_m::t_bad_ndx_t), cc(ss_m::t_cc_bad),
03449                 eff(0), large_store(0), root(0),
03450                 nkc(0), keydescrlen(len),
03451                 keydescr(new char[len]())
                      { }
03452 
          /**\brief Copy constructor: the copy gets its own key-description buffer.
           * \details
           * The compiler-generated copy would share keydescr between the two
           * objects, and both destructors would then delete[] the same array.
           */
          NORET sm_store_info_t(const sm_store_info_t& other) :
                      store(other.store), stype(other.stype),
                      ntype(other.ntype), cc(other.cc),
                      eff(other.eff), large_store(other.large_store),
                      root(other.root), nkc(other.nkc),
                      keydescrlen(other.keydescrlen),
                      keydescr(_clone_keydescr(other))
                      { }

          /**\brief Assignment: replaces this object's buffer with a private copy
           * of \a other's buffer and copies every scalar field.
           */
          sm_store_info_t& operator=(const sm_store_info_t& other)
          {
              if (this != &other) {
                  // Allocate the replacement first so that *this is left
                  // untouched if the allocation throws.
                  char* fresh = _clone_keydescr(other);
                  delete[] keydescr;
                  keydescr    = fresh;
                  store       = other.store;
                  stype       = other.stype;
                  ntype       = other.ntype;
                  cc          = other.cc;
                  eff         = other.eff;
                  large_store = other.large_store;
                  root        = other.root;
                  nkc         = other.nkc;
                  keydescrlen = other.keydescrlen;
              }
              return *this;
          }

          /// Releases the key-description buffer (delete[] of a null pointer is a no-op).
03453     NORET ~sm_store_info_t() { delete[] keydescr; }
03454 
03455     /// store number
03456     snum_t    store;        
03457     /// t_index, t_file, ... See ss_m::store_t.
03458     u_char    stype;        
03459     /// t_btree, t_rtree,... See ss_m::ndx_t
03460     u_char    ntype;        
03461     /// t_cc_kvl, t_cc_record,... See ss_m::concurrency_t
03462     u_char    cc;         
03463 
03464     /// Unused:
03465     u_char    eff;        
03466 
03467     /// Store number for associated large-page store, if there is one.
03468     snum_t    large_store; 
03469     /// Root page if this is an index.
03470     shpid_t    root;        
03471     /// Number of key components if this is an index.
03472     w_base_t::uint4_t    nkc;  
03473     /// Size of key description (if this is an index)
03474     int        keydescrlen;    
03475     /**\brief Variable length string.
03476      *
03477      * Whoever creates a sm_store_info_t for use with get_store_info()
03478      * is responsible for allocating enough space for 
03479      * key descriptors if they expect to find them.
03480      * See \ref key_description.
03481      *
           * The buffer is owned by this object: it is allocated by the
           * constructors and released with delete[] by the destructor.
           */
03482     char        *keydescr;    

      private:
          /**\brief Return a freshly allocated copy of \a src's key-description buffer.
           * \details
           * Copies src.keydescrlen bytes (a negative length is treated as 0).
           * This assumes keydescrlen does not exceed the size of the array
           * keydescr points to; both members are public, so the class cannot
           * enforce that.  Returns a null pointer if src.keydescr is null.
           */
          static char* _clone_keydescr(const sm_store_info_t& src)
          {
              if (src.keydescr == 0) {
                  return 0;
              }
              const int nbytes = (src.keydescrlen > 0) ? src.keydescrlen : 0;
              char* dup = new char[nbytes];
              for (int k = 0; k < nbytes; ++k) {
                  dup[k] = src.keydescr[k];
              }
              return dup;
          }
03483 };
03484 
03485 // Stream insertion/extraction operators for the id and stats types; declared here, defined elsewhere.
03486 ostream& operator<<(ostream& o, const vid_t& v);
03487 istream& operator>>(istream& i, vid_t& v);
03488 ostream& operator<<(ostream& o, const extid_t& x);
03489 istream& operator>>(istream& i, extid_t &x);
03490 ostream& operator<<(ostream& o, const stid_t& stid);
03491 istream& operator>>(istream& i, stid_t& stid);
03492 ostream& operator<<(ostream& o, const lpid_t& pid);
03493 istream& operator>>(istream& i, lpid_t& pid);
03494 ostream& operator<<(ostream& o, const shrid_t& r);
03495 istream& operator>>(istream& i, shrid_t& r);
03496 ostream& operator<<(ostream& o, const rid_t& rid);
03497 istream& operator>>(istream& i, rid_t& rid);
03498 ostream& operator<<(ostream& o, const sm_stats_info_t& s);
03499 template<class Stream>
03500 Stream& operator<<(Stream& o, const sm_config_info_t& s)
03501 {
          // Writes every field of s to o as "  <name> <value>" pairs, in one
          // insertion chain and with no trailing newline.  The function is a
          // template over the stream type, so it accepts whatever type the
          // caller passes as the left-hand operand.
03502     o << "  page_size "            << s.page_size
03503       << "  max_small_rec "        << s.max_small_rec
03504       << "  lg_rec_page_space "    << s.lg_rec_page_space
03505       << "  buffer_pool_size "     << s.buffer_pool_size
03506       << "  max_btree_entry_size " << s.max_btree_entry_size
03507       << "  exts_on_page "         << s.exts_on_page
03508       << "  pages_per_ext "        << s.pages_per_ext
03509       << "  logging "              << s.logging;
03510     // Hand back the caller's own stream object so insertions can be chained.
03511     return o;
03512 }
03513 
03514 
03515 #ifndef VEC_T_H
03516 #include <vec_t.h>
03517 #endif
03518 
03519 #ifndef SM_ESCALATION_H
03520 #include <sm_escalation.h>
03521 #endif
03522 
03523 /*<std-footer incl-file-exclusion='SM_H'>  -- do not edit anything below this line -- */
03524 
03525 #endif          /*</std-footer>*/

Generated on Mon Jan 2 15:13:57 2012 for Shore Storage Manager by  doxygen 1.4.7