sm.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_H'>
00025 
00026  $Id: sm.h,v 1.322 2010/10/27 17:04:23 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef SM_H
00054 #define SM_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 /*
00061  *  Stuff needed by value-added servers.  NOT meant to be included by
00062  *  internal SM .c files, except to the extent that they need these
00063  *  definitions used in the API.
00064  */
00065 
00066 #ifdef __GNUG__
00067 #pragma interface
00068 #endif
00069 
00070 #ifndef SM_INT_4_H
00071 #include <sm_int_4.h>
00072 #endif
00073 
00074 #ifndef SM_DU_STATS_H
00075 #include <sm_du_stats.h> // declares sm_du_stats_t
00076 #endif
00077 
00078 #ifndef SM_STATS_H
00079 #include <smstats.h> // declares sm_stats_info_t and sm_config_info_t
00080 #endif
00081 
00082 #ifndef SM_S_H
00083 #include <sm_s.h> // declares key_type_s, rid_t, lsn_t
00084 #endif
00085 
00086 #ifndef LEXIFY_H
00087 #include <lexify.h> // declares sortorder with constants
00088 #endif
00089 
00090 #ifndef NBOX_H
00091 #include <nbox.h>   // key_info_t contains nbox_t
00092 #endif /* NBOX_H */
00093 
00094 #ifndef SORT_S_H
00095 #include <sort_s.h> // declares key_info_t
00096 #endif
00097 
00098 /* DOXYGEN Documentation : */
00099 
00100 /**\addtogroup LOGSPACE 
00101  *
00102  * Updates performed by transactions are logged so that
00103  * they can be rolled back (in the event of a transaction abort)
00104  * or restored (in the event of a crash).  Both the old and new values
00105  * of an updated location are logged.  This allows a steal, no-force
00106  * buffer management policy, which means the buffer manager is free
00107  * to write dirty pages to disk at any time and yet does not have
00108  * to write dirty pages for a transaction to commit.
00109  *
00110  * The log is stored in a set of Unix files, all in the same directory,
00111  * whose path is determined by a run-time option.
00112  * The maximum size of the log is also determined by a run-time option.
00113  * The proper value of the log size depends on
00114  * the expected transaction mix.  More specifically, it depends on the
00115  * age of the oldest (longest running) transaction in the system and
00116  * the amount of log space used by all active transactions. Here are
00117  * some general rules to determine the  amount  of  free  log  space
00118  * available in the system.
00119  * - Log records between the first log
00120  *   record generated by the oldest active transaction and the most
00121  *   recent log record generated by any transaction cannot be thrown
00122  *   away.
00123  * - Log records from a transaction are no longer needed
00124  *   once the transaction has committed or completely aborted and all
00125  *   updates have made it to disk. Aborting a transaction causes log space
00126  *   to be used, so space is reserved for aborting each transaction.
00127  *   Enough log space must be available to commit or abort all active
00128  *   transactions at all times.
00129  * 
00130  * - Only space starting at the beginning of the log can be reused.  
00131  *   This space can be reused if it contains log records only for 
00132  *   transactions meeting the previous rule.
00133  *
00134  * -  All storage manager calls that update records require log space twice
00135  *    the size of the space updated in the record. All calls that create,
00136  *    append, or truncate records require log space equal to the size
00137  *    created, inserted, or deleted. Log records generated by these calls
00138  *    (generally one per call) have an overhead of approximately 50 bytes.
00139  *
00140  * - The amount of log space reserved for aborting a transaction is equal to 
00141  *   the amount of log space generated by the transaction plus a fudge 
00142  *   factor. 
00143  *   (Where btrees are concerned, a structure modification
00144  *   might be necessary on abort, using more space on abort, or might not be
00145  *   necessary on abort where it was done during forward processing, 
00146  *   using less space on abort.)
00147  *
00148  * - The transaction assumes responsibility for reserving space in the
00149  *   log so that it can abort, should it need to (without leaving an
00150  *   unrecoverable volume).  The transaction and the log cooperate to
00151  *   reserve space for the transaction's aborting.
00152  *
00153  * - When insufficient log space is available for a transaction, the 
00154  *   transaction is (may be, depending on the server) aborted.
00155  *   The storage manager will return an error indication (out of log space)
00156  *   if it is unable to insert a log record into the log due to
00157  *   insufficient space.
00158  *
00159  * Checkpoints are taken periodically by the storage manager in order to 
00160  * free log space and shorten recovery time.  Checkpoints are "fuzzy" 
00161  * and do not require the system to pause while they are completing.
00162  *
00163  * See the storage manager constructor ss_m::ss_m for more information
00164  * about handling out-of-logspace conditions.
00165  *
00166  */
00167 
00168 /**\addtogroup SSMOPT
00169  *
00170  * These are the run-time options for the storage manager.
00171  *
00172  * -sm_bufpoolsize : 
00173  *      - type: number
00174  *      - description: This is the size of 
00175  *      the buffer pool in Kb.  Must be large enough to hold at least 32 pages,
00176  *      so it depends on the configured page size.
00177  *      - default: none
00178  *      - required?: yes
00179  *
00180  * -sm_hugetlbfs_path
00181  *      - type: string (full absolute path name)
00182  *      - description: Needed only if you configured --with-hugetlbfs.
00183  *      - default: see \ref CONFIGOPT
00184  *      - required?: no
00185  *
00186  * -sm_reformat_log
00187  *      - type: Boolean
00188  *      - description: If "yes", your log will be clobbered and the storage
00189  *      manager will start up with an entirely new log.
00190  *      - default: no
00191  *      - required?: no
00192  *
00193  * -sm_logdir
00194  *      - type: string (relative or absolute path name)
00195  *      - description: Location of the log files.
00196  *      - default: none
00197  *      - required?: yes
00198  *
00199  * -sm_logbufsize
00200  *      - type: number
00201  *      - description: size of log buffer in KB.
00202  *      Must be greater than or equal to the larger of
00203  *      (4 times the page size, 64 Kb)
00204  *      and less than or equal to
00205  *      128 times the page_size. This is the size of 
00206  *      the log buffer in Kb.
00207  *      - default: 128
00208  *      - required?: no
00209  *
00210  * -sm_logsize
00211  *      - type: number
00212  *      - description: greater than or equal to 8256 
00213  *      This is the maximum size of the log in Kb.  It is a function of
00214  *      the log buffer size, and  the default is the minimum allowable for
00215  *      the default sm_logbufsize.
00216  *      - default: 128
00217  *      - required?: yes
00218  *
00219  * -sm_log_warn
00220  *      - type: number between 0 and 100 (percentage)
00221  *      - description: percentage of log that, when consumed by active
00222  *      transactions, triggers a callback warning of potential inability
00223  *      to roll back.   Should be less than 50.
00224  *      - default: 45
00225  *      - required?: no
00226  *
00227  * -sm_errlog
00228  *      - type: string (relative or absolute path name OR - )
00229  *      - description: Destination for error messages.  If "-" is given,
00230  *      the destination is stderr.
00231  *      - default: \b -
00232  *      - required?: no
00233  *
00234  * -sm_errlog_level
00235  *      - type: string  (one of none|emerg|fatal|internal|error|warning|info|debug)
00236  *      - description: filter.  Messages of this priority or higher are issued to
00237  *      the error log; messages with lower priority are not issued.
00238  *      The priorities are listed from high to low. "none" means no logging
00239  *      will happen.
00240  *      - default: error
00241  *      - required?: no
00242  *
00243  * -sm_locktablesize : 
00244  *      - type: number greater than or equal to 64
00245  *      - description: size of lock manager's hash table will be a prime
00246  *      number near and greater than the given number.
00247  *      - default: 64000 (yields a hash table with 65521 buckets)
00248  *      - required?: no
00249  *
00250  * -sm_lock_escalate_to_page_threshold
00251  *      - type: number greater than or equal to 0
00252  *      - description: after acquiring this many record locks on a page, the lock
00253  *      will be escalated to a page lock. A value of 0 disables escalation to a
00254  *      page lock.
00255  *      - default: 5
00256  *      - required?: no
00257  *
00258  * -sm_lock_escalate_to_store_threshold
00259  *      - type: number greater than or equal to 0
00260  *      - description: after acquiring this many page locks in a store,
00261  *      the lock will be escalated to a store lock. 
00262  *      A value of 0 disables escalation to a store lock.
00263  *      - default: 25
00264  *      - required?: no
00265  *      
00266  * -sm_lock_escalate_to_volume_threshold
00267  *      - type: number greater than or equal to 0
00268  *      - description: after acquiring this many store locks in a volume,
00269  *      the lock will be escalated to a volume lock. 
00270  *      A value of 0 disables escalation to a volume lock.
00271  *      - default: 0
00272  *      - required?: no
00273  *
00274  * -sm_cc_alg
00275  *      - type: string (one of file | page | record | none)
00276  *      - description: default locking granularity for file operations.
00277  *      This can be overridden on a per-transaction basis with
00278  *      ss_m::set_xct_lock_level().
00279  *      - default: record
00280  *      - required?: no
00281  *
00282  * -sm_backgroundflush
00283  *      - type: Boolean
00284  *      - description: Enables background-flushing of volumes.
00285  *      Must be set to "yes" for sm_num_page_writers to have any effect.
00286  *      - default: yes
00287  *      - required?: no
00288  *
00289  * -sm_num_page_writers
00290  *      - type: number
00291  *      - description: greater than or equal to 0; this is the number of
00292  *      background-flushing threads for each volume. If you have 
00293  *      lots of threads, 
00294  *      a huge buffer pool, and few volumes, you should increase this.
00295  *      If sm_backgroundflush is "no", this value is ignored.
00296  *      - default: 2
00297  *      - required?: no
00298  *
00299  * -sm_prefetch
00300  *      - type: Boolean
00301  *      - description: Enables prefetching for scans.
00302  *      - default: no
00303  *      - required?: no
00304  *
00305  * -sm_logging
00306  *      - type: Boolean
00307  *      - description: Allows you to turn off logging for a run of
00308  *      the storage manager. This is only for experimentation, to
00309  *      measure logging overhead in a limited way.
00310  *      Aborts, rollbacks and restart/recovery 
00311  *      do not work without logging.   Independent concurrent
00312  *      transactions using btrees might not work without logging (this is
00313  *      not well-tested).
00314  *      Each time you start the server, you had better start with a
00315  *      clean device or a device that resulted from a clean shutdown
00316  *      of the prior run.
00317  *      - default: yes
00318  *      - required?: no
00319  *
00320  * -sm_lock_caching
00321  *      - type: Boolean
00322  *      - description: Enables caching of transaction locks in transaction.
00323  *      Can be turned off for experimentation. If no, the default is not
00324  *      to cache locks, but any transaction can turn on caching for itself
00325  *      by calling the ss_m method  set_lock_cache_enable(bool enable).
00326  *      - default: yes
00327  *      - required?: no
00328  *
00329  */
00330 
00331 
00332 /**\addtogroup SSMXCT 
00333  * All storage manager operations on data must be done within the scope of
00334  * a transaction (ss_m::begin_xct, ss_m::commit_xct, ss_m::abort_xct,
00335  * ss_m::chain_xct). 
00336  *
00337  * A very few storage manager operations, such as formatting a volume, are
00338  * called outside the scope of a transaction and the storage manager begins
00339  * its own transaction to do the work.
00340  *
00341  * Operations that fail return an error indication and the storage 
00342  * manager assumes that the server will thereafter abort the 
00343  * transaction in which the error occurred, when abort is indicated.
00344  * Abort is indicated when eUSERABORT or eDEADLOCK is returned and 
00345  * when the server chooses to abort rather than to work around the problem
00346  * (whatever it might be, such as eRETRY).
00347  *
00348  * The storage manager does not enforce the aborting of any erroneous
00349  * transactions except, possibly, those that are in danger of 
00350  * running out of log space.
00351  * (This is done with the destructor of the prologue used on each call
00352  * to the storage manager, see next paragraph).
00353  *
00354  * It is always the server's responsibility to abort.
00355  * When the storage manager 
00356  * encounters an eLOGSPACEWARN condition (the log hasn't enough
00357  * space \e at \e this \e moment to abort the running transaction,
00358  * assuming a 1:1 ratio of rollback-logging overhead to forward-processing
00359  * logging overhead), it does one of two things:
00360  * - passes the error code eLOGSPACEWARN up the call stack back to the server
00361  *   if the storage manager was constructed with no log-space-warning callback
00362  *   argument (see LOG_WARN_CALLBACK_FUNC, ss_m::ss_m).
00363  * - tries to abort a transaction before passing an error code back up
00364  *   the call stack to the server. Choosing a victim transaction to abort
00365  *   is done by the server in its log-space-warning callback function (passed
00366  *   in on ss_m::ss_m, q.v.).
00367  *   Only if that callback function returns a non-null victim transaction
00368  *   and returns eUSERABORT does the storage manager abort that victim
00369  *   before returning eUSERABORT up the call stack. Any other
00370  *   error code returned by the callback function is just returned up
00371  *   the call stack.
00372  *
00373  * \section LOCKS Locks 
00374  *
00375  * The storage manager automatically acquires the 
00376  * necessary locks when the data are read or written.
00377  * The locks thus acquired are normally released at the end of a transaction,
00378  * thus, by default, transactions are two-phase and well-formed (degree 3).
00379  *
00380  * \subsection GRAN Lock Granularity
00381  * The fine-grained locks are normally used for records in files, but
00382  * provision is made for using coarser-grained locks.  The transaction
00383  * has a default lock level associated with it,
00384  * which governs the granularity of locks acquired by the storage manager
00385  * on behalf of the transaction.
00386  * The lock manager provides for lock escalation to coarser locks to
00387  * reduce the locking costs.  See \ref SSMLOCK and smlevel_0::concurrency_t. 
00388  *
00389  * Key-value locking is normally used for B+-Trees. (See \ref MOH1.)
00390  * R*-Trees normally use coarse-granularity locking.
00391  * The locking protocol used with an index is determined when the
00392  * index is created.  A transaction may acquire coarse (index-level)
00393  * locks with explicit calls to the lock manager, but by default, 
00394  * the granularity/level/protocol associated with the index is used.
00395  * See smlevel_0::concurrency_t. 
00396  *
00397  * \section DISTXCT Distributed Transactions
00398  * Storage manager transactions may be used as "threads" (to 
00399  * overload this term) of distributed transactions.  
00400  * Coordination of 2-phase commit must be done externally,
00401  * but the storage manager supports preparing the (local) transaction "thread" 
00402  * for two-phase commit, and it will log the necessary 
00403  * data for recovering in-doubt transactions.
00404  *
00405  * \section ATTACH Threads and Transactions
00406  * Transactions are not tied to storage manager threads (smthread_t, not
00407  * to be confused with a local "thread" of a distributed transaction) in any 
00408  * way other than that a transaction must be \e attached to a
00409  * thread while any storage manager work is being done on behalf of 
00410  * that transaction.   This is how the storage manager knows \e which
00411  * transaction is to acquire the locks and latches, etc.
00412  * But a thread can attach and detach from transactions at will, so
00413  * work may be performed by different threads each time the storage
00414  * manager is called on behalf of a given transaction; this allows the
00415  * server to keep a pool of threads to perform work and allows them to
00416  * perform work on behalf of any active transaction.
00417  *
00418  * \warning
00419  * While there are limited circumstances in which multiple threads can be
00420  * attached to the same transaction \e concurrently and perform storage 
00421  * manager operations on behalf of that transaction concurrently,
00422  * which is a hold-over from the original storage manager, this 
00423  * functionality will be deprecated soon.  The reason for this being
00424  * removed is that it is extremely difficult to handle errors internally
00425  * when multiple threads are attached to a transaction because 
00426  * partial rollback is impossible in the absence of multiple log streams
00427  * for a transaction.
00428  *
00429  * Under no circumstances may a thread attach to more than one transaction
00430  * at a time.
00431  *
00432  *
00433  * \section EXOTICA Exotica
00434  * The storage manager also provides 
00435  * - partial rollback (ss_m::save_work and ss_m::rollback_work), 
00436  *   which undoes actions but does not release locks,
00437  * - transaction chaining (ss_m::chain_xct), which commits, but retains locks
00438  *   and gives them to a new transaction,
00439  * - lock release (sm_quark_t, ss_m::unlock), allowing less-than-3-degree
00440  *   transactions.
00441  *
00442  *  To reduce the cost (particularly in logging) of loading databases,
00443  *  the storage manager provides for unlogged loading of stores.
00444  *  See \ref SSMSTORE.
00445  */
00446 
00447 
00448 
00449 /** \file sm_vas.h
00450  * \details
00451  * This is the include file that all value-added servers should
00452  * include to get the Shore Storage Manager API.
00453  *
00454  */
00455 /********************************************************************/
00456 
00457 class page_p; // forward declarations only: none of these classes is defined in this header's visible text
00458 class xct_t;
00459 class device_m;
00460 class vec_t;
00461 class log_m;
00462 class lock_m;
00463 class btree_m;
00464 class file_m;
00465 class pool_m;
00466 class dir_m;
00467 class chkpt_m;
00468 class lid_m; 
00469 class sm_stats_cache_t;
00470 class option_group_t;
00471 class option_t;
00472 class prologue_rc_t;
00473 class rtree_m;
00474 class sort_stream_i;
00475 
00476 /**\addtogroup SSMSP  
00477  * A transaction may perform a partial rollback using savepoints.
00478  * The transaction populates a savepoint by calling ss_m::save_work,
00479  * then it may roll back to that point with ss_m::rollback_work.
00480  * Locks acquired between the save_work and rollback_work are \e not
00481  * released.
00482  */
00483 
00484 /**\brief A point to which a transaction can roll back.
00485  * \ingroup SSMSP
00486  *\details
00487  * A transaction can do partial rollbacks with
00488  * save_work  and rollback_work, which use this class to determine
00489  * how far to roll back.
00490  * It is nothing more than a log sequence number for the work done
00491  * to the point when save_work is called.
00492  */
00493 class sm_save_point_t : public lsn_t {
00494 public:
00495     NORET            sm_save_point_t() : _tid(0,0) { } // base lsn_t is default-constructed; _tid starts as (0,0)
00496     tid_t            tid() const { return _tid; } // by-value copy of the transaction id held by this savepoint
00497     friend ostream& operator<<(ostream& out, const sm_save_point_t& sp) { // writes "<tid>:<lsn>"
00498         return out << sp._tid << ':' << static_cast<const lsn_t&>(sp);
00499     }
00500     friend istream& operator>>(istream& in, sm_save_point_t& sp) { // reads tid, one separator char, then lsn
00501         char separator; // consumed between tid and lsn; its value is not checked
00502         return in >> sp._tid >> separator >> static_cast<lsn_t&>(sp);
00503     }
00504 private:
00505     tid_t            _tid;
00506     friend class ss_m; // ss_m may access _tid directly
00507 };
00508 
00509 /**\addtogroup SSMQK  
00510  * A quark is a marker in the transaction's list of acquired locks.
00511  * One may release all short-duration locks acquired since the quark was inserted 
00512  * into the list via sm_quark_t::open().
00513  * The lock manager modifies the locks acquired inside a quark
00514  * so that non-extent locks are no longer than short-duration.
00515  *
00516  * This is for experimentation only, and is \e not well-tested or supported.
00517  *
00518  * How used:
00519  * \code
00520  * sm_quark_t *q = new sm_quark_t;
00521  * q->open();  // inserts marker in transaction's list.
00522  * ...
00523  * q->close(); // frees short-duration locks to the marker.
00524  * delete q;
00525  * \endcode
00526  *
00527  * Deleting the quark without closing it causes it to be closed.
00528  * Quarks may \e not be used with multi-threaded transactions.
00529  *
00530  * Note that if a transaction has multiple threads attached when
00531  * a thread opens a quark, there is no way to determine where the
00532  * quark takes effect, and since it affects the locks acquired by
00533  * all threads of the transaction, it must be used very carefully
00534  * where multiply-threaded transactions are concerned.
00535  */
00536 
00537 /**\brief List of locks acquired by a transaction since
00538  * the quark was "opened".   
00539  * \ingroup SSMQK
00540  * \details
00541  * When a quark is closed (by calling close()), 
00542  * the release_locks parameter indicates if all short-duration read
00543  * locks acquired during the quark should be released.
00544  * \note Quarks are an experimental feature for use 
00545  * as a building block for a more general nested-transaction facility.
00546  *
00547  * \internal See lock_x.h
00548  */
00549 class sm_quark_t {
00550 public:
00551     NORET            sm_quark_t() {} // empty body; _tid is default-constructed
00552     NORET            ~sm_quark_t(); // per the SSMQK notes, deleting an unclosed quark causes it to be closed
00553 
00554     rc_t            open(); // inserts the quark's marker in the transaction's lock list (see SSMQK usage example)
00555     rc_t            close(bool release=true); // frees short-duration locks back to the marker; NOTE(review): 'release' appears to be the "release_locks" flag described in the class documentation -- confirm
00556 
00557     tid_t            tid()const { return _tid; } // by-value copy of the stored transaction id
00558     operator         bool()const { return (_tid != tid_t::null); } // true iff _tid differs from tid_t::null
00559     friend ostream& operator<<(ostream& o, const sm_quark_t& q); // stream output; defined outside this header
00560     friend istream& operator>>(istream& i, sm_quark_t& q); // stream input; defined outside this header
00561 
00562 private:
00563     friend class ss_m; // ss_m may access _tid directly
00564     tid_t            _tid;
00565 
00566     // Copying is disabled: copy constructor and copy assignment are declared private (presumably never defined).
00567     sm_quark_t(const sm_quark_t&);
00568     sm_quark_t& operator=(const sm_quark_t&);
00569 
00570 };
00571 
00572 class sm_store_info_t; // forward declaration only
00573 class log_entry; // log_entry, coordinator and tape_t are named as friends of ss_m below
00574 class coordinator;
00575 class tape_t;
00576 /**\brief \b This \b is \b the \b SHORE \b Storage \b Manager \b API.
00577  *\details
00578  * Most of the API for using the storage manager is through this
00579  * interface class.
00580  */
00581 class ss_m : public smlevel_top 
00582 {
00583     friend class pin_i;
00584     friend class sort_stream_i;
00585     friend class prologue_rc_t;
00586     friend class log_entry;
00587     friend class coordinator;
00588     friend class tape_t;
00589 public:
00590 
00591     typedef smlevel_0::LOG_WARN_CALLBACK_FUNC LOG_WARN_CALLBACK_FUNC;
00592     typedef smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC LOG_ARCHIVED_CALLBACK_FUNC;
00593     typedef smlevel_0::ndx_t ndx_t;
00594     typedef smlevel_0::concurrency_t concurrency_t;
00595     typedef smlevel_1::xct_state_t xct_state_t;
00596 
00597     typedef sm_store_property_t store_property_t;
00598 
00599 #ifdef COMMENT
00600     //
00601     // Below is most of the interface for the SHORE Storage Manager.
00602     // The rest is located in pin.h, scan.h, and smthread.h
00603     //
00604 
00605     //
00606     // TEMPORARY FILES/INDEXES
00607     //
00608     // When a file or index is created there is a tmp_flag parameter
00609     // that when true indicates that the file is temporary.
00610     // Operations on a temporary file are not logged and the
00611     // file will be gone the next time the volume is mounted.
00612     //
00613     // TODO: IMPLEMENTATION NOTE on Temporary Files/Indexes:
00614     //        Temp files cannot be trusted after transaction abort.
00615     //            They should be marked for removal.
00616     //
00617     // CODE STRUCTURE:
00618     //    Almost all ss_m functions begin by creating a prologue object
00619     //    whose constructor and destructor check for many common errors.
00620     //    In addition most ss_m::OP() functions now call an ss_m::_OP()
00621     //    function to do the real work.  The ss_m::OP functions should
00622     //    not be called by other ss_m functions, instead the corresponding
00623     //    ss_m::_OP function should be used.
00624     //
00625 
00626 #endif /* COMMENT */
00627 
00628   public:
00629     /**\brief Add storage manager options to the given options group.
00630      *\ingroup SSMINIT
00631      *\details
00632      * @param[in] grp The caller's option group, to which the
00633      * storage manager's options will be added for processing soon.
00634      *
00635      * Before the ss_m constructor can be called, setup_options
00636      * \b must be called.  This will install the storage manager's options and
00637      * initialize any that are not required.
00638      * Once all required options have been set, an ss_m can be constructed.
00639      *
00640      *\note This is not thread-safe.  The application (server) must prevent
00641      * concurrent calls to setup_options.
00642      */
00643     static rc_t setup_options(option_group_t* grp);
00644 
00645     /**\brief  Initialize the storage manager.
00646      * \ingroup SSMINIT
00647      * \details
00648      * @param[in] warn   A callback function. This is called 
00649      * when/if the log is in danger of becoming "too full".
00650      * @param[in] get   A callback function. This is called 
00651      * when the storage manager needs an archived log file to be restored.
00652      *
00653      * When an ss_m object is created, the storage manager initializes itself
00654      * and,
00655      * if the sthreads package has not already been initialized by virtue
00656      * of an sthread_t running, the sthreads package is initialized now.
00657      *
00658      * The log is read and recovery is performed (\ref MHLPS), 
00659      * and control returns to
00660      * the caller, after which time
00661      * storage manager threads (instances of smthread_t) may be constructed and
00662      * the storage manager may be used.
00663      *
00664      * The storage manager is used by invoking its static methods.  
00665      * You may use them as follows:
00666      * \code
00667      * ss_m *UNIQ = new ss_m();
00668      *
00669      * W_DO(UNIQ->mount_dev(...))
00670      *     // or
00671      * W_DO(ss_m::mount_dev(...))
00672      * \endcode
00673      *
00674      *
00675      * Only one ss_m object may be extant at any time. If you try
00676      * to create another while the one exists, a fatal error will occur
00677      * (your program will choke with a message about your mistake).
00678      *
00679      * The callback argument given to the storage manager constructor
00680      * is called when the storage manager determines that it is in danger
00681      * of running out of log space.  Heuristics are used to guess when
00682      * this is the case.  
00683      *
00684      * If the function \a warn archives and removes log files, the function
00685      * \a get must be provided to restore those log files when the
00686      * storage manager needs them.
00687      *
00688      * For details and examples, see  \ref smlevel_0::LOG_WARN_CALLBACK_FUNC, 
00689      *  \ref smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC, and 
00690      *  \ref LOGSPACE.
00691      */
00692     ss_m(LOG_WARN_CALLBACK_FUNC warn=NULL, LOG_ARCHIVED_CALLBACK_FUNC get=NULL);
00693 
00694     /**\brief  Shut down the storage manager.
00695      * \ingroup SSMINIT
00696      * \details
00697      * When the storage manager object is deleted, it shuts down.
00698      * Thereafter it is not usable until another ss_m object is 
00699      * constructed.
00700      */
00701     ~ss_m();
00702 
00703     /**\brief Cause the storage manager's shutting down to be done cleanly 
00704      * or to simulate a crash.
00705      * \ingroup SSMINIT
00706      * \details
00707      * @param[in] clean   True means shut down gracefully, false means simulate a crash.
00708      *
00709      * When the storage manager's destructor is called
00710      * the buffer pool is flushed to disk, unless this method is called 
00711      * with \a clean == \e false.
00712      *
00713      * \note If this method is used, it
00714      * must be called after the storage manager is 
00715      * constructed if it is to take effect. Each time the storage
00716      * manager is constructed, the state associated with this is set
00717      * to \e true, i.e., "shut down properly".
00718      *
00719      * \note This method is not thread-safe, only one thread should use this
00720      * at any time, presumably just before shutting down.
00721      */
00722     static void         set_shutdown_flag(bool clean);
00723 
00724     /**\brief Notify storage manager when a log file was archived by a
00725      * LOG_WARN_CALLBACK_FUNC.
00726      *
00727      * The arguments:
00728      * @param[in] logfile   Character string name of file archived.
00729      */
00730     static rc_t         log_file_was_archived(const char * logfile);
00731 
00732 private:
00733     void                _construct_once(LOG_WARN_CALLBACK_FUNC x=NULL,
00734                                            LOG_ARCHIVED_CALLBACK_FUNC y=NULL);
00735     void                _destruct_once();
00736 
00737 
00738 public:
00739     /**\addtogroup SSMXCT
00740      *
00741      * All work performed on behalf of a transaction must occur while that
00742      * transaction is "attached" to the thread that performs the work.
00743      * Creating a transaction attaches it to the thread that creates the transaction. 
00744      * The thread may detach from the transaction and attach to another.
00745      * Multiple threads may attach to a single transaction and do work in certain circumstances.   See \ref SSMMULTIXCT
00746      *
00747      * 
00748      */
00749     /**\brief Begin a transaction 
00750      *\ingroup SSMXCT
00751      * @param[in] timeout   Optional, controls blocking behavior.
00752      * \details
00753      *
00754      * Start a new transaction and "attach" it to this thread. 
00755      * No running transaction may be attached to this thread.
00756      * 
00757      * Storage manager methods that must block (e.g., to acquire a lock) 
00758      * will use the timeout given.  
00759      * The default timeout is the one associated with this thread.
00760      *
00761      * \sa timeout_in_ms
00762      */
00763     static rc_t           begin_xct(
00764         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00765 
00766     /**\brief Begin an instrumented transaction.
00767      *\ingroup SSMXCT
00768      * @param[in] stats   Pointer to an allocated statistics-holding structure.
00769      * @param[in] timeout   Optional, controls blocking behavior.
00770      * \details
00771      * No running transaction may be already attached to this thread.
00772      * A new transaction is started and attached to the running thread.
00773      *
00774      * The transaction will be instrumented.
00775      * The \a stats structure is updated by the storage manager whenever a thread
00776      * detaches from this transaction.  The activity recorded during
00777      * the time the thread is attached to the transaction will be stored in
00778      * the per-transaction statistics.
00779      * \attention It is the client's
00780      * responsibility to delete the statistics-holding structure.
00781      *
00782      * Storage manager methods that must block (e.g., to acquire a lock)
00783      * will use the timeout given.
00784      * The default timeout is the one associated with this thread.
00785      *
00786      * \sa timeout_in_ms
00787      */
00788     static rc_t           begin_xct(
00789         sm_stats_info_t*         stats,  // allocated by caller
00790         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00791 
00792     /**\brief Begin a transaction and return the transaction id.
00793      *\ingroup SSMXCT
00794      * @param[out] tid      Transaction id of new transaction.
00795      * @param[in] timeout   Optional, controls blocking behavior.
00796      * \details
00797      *
00798      * No running transaction may be attached to this thread.
00799      * 
00800      * Storage manager methods that must block (e.g., to acquire a lock) 
00801      * will use the timeout given.  
00802      * The default timeout is the one associated with this thread.
00803      *
00804      * \sa timeout_in_ms
00805      */
00806     static rc_t           begin_xct(
00807         tid_t&                   tid,
00808         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00809 
00810     /**\addtogroup SSM2PC  
00811      * The storage manager contains support for externally-coordinated
00812      * transactions that use
00813      * two-phase-commit with presumed abort.
00814      * The server must provide the coordination and the coordinator is
00815      * assumed to have its own stable storage, and it is assumed to recover
00816      * from failures in a "short time", the precise meaning of which is given below.
00817      * A prepared transaction, like an active transaction,
00818      * consumes log space and holds locks.
00819      * Even if a prepared transaction does not hold locks needed by 
00820      * other transactions, it consumes resources in a way that can interfere 
00821      * with other transactions.
00822      * If a prepared transaction remains in the system for a long time 
00823      * while other transactions are running, eventually the storage 
00824      * manager needs the log space used (reserved) by the prepared transaction.
00825      * A coordinator must resolve its prepared transactions
00826      * before the storage manager effectively runs out of 
00827      * log space for other transactions in the system.
00828      * The amount of time involved is a function of the size of the log
00829      * and of the demands of the other transactions in the system.
00830      *
00831      * For the purpose of this discussion, the portion of a global 
00832      * transaction that involves a single Shore Storage Manager transaction is 
00833      * called a thread of the global transaction.
00834      *
00835      * A Shore transaction participates as a thread of a global transaction
00836      * as follows:
00837      - Start a storage-manager transaction with ss_m::begin_xct.
00838      - Acquire a global transaction identifier from the coordinator.
00839      - Indicate to the storage manager that this transaction is a 
00840      thread of a global transaction, and associate the global transaction 
00841      identifier with this thread by calling ss_m::enter_2pc.
00842      - Associate a coordinator with the transaction for recovery 
00843      purposes, by calling ss_m::set_coordinator.
00844      - Prepare the thread of the transaction and get the storage manager's 
00845      vote with ss_m::prepare_xct.  
00846      It is an error to commit a global transaction thread without first 
00847      preparing it.  It is an error to do anything else 
00848      in a transaction after it is prepared, except to end 
00849      the transaction or retry the prepare (to get the vote again).
00850      - Convey the vote to the coordinator, and determine the transaction's 
00851      fate from the coordinator.
00852      - End the thread with ss_m::commit_xct or ss_m::abort_xct.
00853      *
00854      * The storage manager 
00855      * logs the minimal information required to effect a vote of the
00856      * transaction threads that are storage manager transactions,
00857      * and to recover such in-doubt transactions after restart.
00858      * Thus, after a crash/restart, the server may query the storage manager
00859      * about in-doubt (prepared) transactions with ss_m::query_prepared_xct,
00860      * which tells the caller the number and global transaction IDs associated
00861      * with prepared transactions.
00862      * Using this, the server contacts the coordinator and resumes the
00863      * voting.
00864      * The server may find the local transaction IDs and use ss_m::tid_to_xct
00865      * to attach these transactions  and to resolve them.
00866      * 
00867      * Commit and abort of read-only transactions are the same,
00868      * as these transactions have no log entries.  Preparing read-only transactions
00869      * causes them to commit/abort and the vote returned is vote_readonly.
00870      * Once this vote is communicated to the coordinator and the coordinator
00871      * records it on stable storage, there is no need to involve this thread in
00872      * any further processing.  For this reason,
00873      * read-only transactions do not appear as prepared transactions at
00874      * recovery time.
00875      * 
00876      */
00877 
00878     /**\brief Make the attached transaction a thread of a distributed transaction.
00879      *\ingroup SSM2PC
00880      *
00881      * @param[in] gtid    Global transaction ID to associate with this transaction.  This will be logged when the transaction is prepared.
00882      * 
00883      * \note This can be called at most once for a given transaction.
00884      * The transaction must be attached to the calling thread.
00885      * No other threads may be attached to the transaction.
00886      */
00887     static rc_t           enter_2pc(const gtid_t &gtid); 
00888     /**\brief Assign a coordinator handle to this distributed transaction.
00889      *\ingroup SSM2PC
00890      * @param[in] h      Handle of the coordinator.  Not interpreted by
00891      * the storage manager.
00892      *
00893      * The storage manager associates this server handle with the transaction 
00894      * so that when the transaction is prepared, this information is 
00895      * written to the log. Upon recovery, if this transaction is still in doubt,
00896      * the value-added server can query the 
00897      * storage manager for in-doubt transactions, get their server handles,
00898      * and resolve the transactions.
00899      * See query_prepared_xct and recover_2pc.
00900      */
00901     static rc_t           set_coordinator(const server_handle_t &h); 
00902 
00903     /**\brief Prepare a thread of a distributed transaction.
00904      *\ingroup SSM2PC
00905      * @param[out] stats    Set to the transaction's statistics-holding
00906      *                      structure if the transaction is instrumented.
00907      * @param[out] vote     This thread's vote.
00908      *
00909      * The storage manager will prepare the attached transaction (a thread
00910      * of a distributed transaction) for commit.
00911      * If this transaction has performed no logged updates, the
00912      * vote returned will be vote_readonly.
00913      * If this transaction can commit, the vote returned will be vote_commit.
00914      * If an error occurs during the prepare, the vote will be vote_abort.
00915      *
00916      * If the transaction is being instrumented, the
00917      * statistics-holding structure will be returned to the caller,
00918      * and the caller is responsible for its deallocation.
00919      */
00920     static rc_t           prepare_xct(
00921                             sm_stats_info_t*&         stats, 
00922                             vote_t&                   vote); 
00923 
00924     /**\brief Prepare a thread of a distributed transaction.
00925      *\ingroup SSM2PC
00926      * @param[out] vote     This thread's vote. See \ref w_base_t::vote_t.
00927      *
00928      * The storage manager will prepare the attached transaction (a thread
00929      * of a distributed transaction) for commit.
00930      * If this transaction has performed no logged updates, the 
00931      * vote returned will be vote_readonly.
00932      * If this transaction can commit, the vote returned will be vote_commit.
00933      * If an error occurs during the prepare, the vote will be vote_abort.
00934      */
00935     static rc_t           prepare_xct(vote_t &vote); 
00936 
00937     /**\brief Force the transaction to vote "read-only" in a two-phase commit. 
00938      *\ingroup SSM2PC
00939      * \details
00940      * This will override the storage manager's determination of 
00941      * whether this thread of a distributed transaction is read-only, which is
00942      * based on whether the local transaction thread logged anything. This
00943      * method may be useful if the local transaction rolled back to 
00944      * a savepoint.
00945      * See  \ref w_base_t::vote_t.
00946      */
00947     static rc_t           force_vote_readonly(); 
00948 
00949     /**\brief Given a global transaction id, find the local prepared
00950      * transaction associated with it.
00951      *\ingroup SSM2PC
00952      * @param[in] gtid     A global transaction ID (an opaque quantity
00953      * to the storage manager).
00954      * @param[in] mayblock Not used.
00955      * @param[out] local   Return the transaction ID of the prepared
00956      * SM transaction.
00957      * \details
00958      * Searches the transaction list for a prepared transaction with the given
00959      * global transaction id. If found, the ID of the local transaction
00960      * is returned in \a local.  The transaction is attached to the running
00961      * thread before it is returned.
00962      */
00963     static rc_t           recover_2pc(const gtid_t & gtid,
00964         bool                      mayblock,
00965         tid_t &                   local
00966         );
00967 
00968     /**\brief  Return the number of prepared transactions.
00969      *\ingroup SSM2PC
00970      * @param[out] numtids   The number of in-doubt transactions.
00971      * \details
00972      * Used by a server at start-up, after recovery, to find out if
00973      * there are any in-doubt transactions.  If so, the server must
00974      * use the second form of query_prepared_xct to find the global
00975      * transaction IDs of these in-doubt transactions.
00976      */
00977     static rc_t           query_prepared_xct(int &numtids);
00978 
00979     /**\brief  Return the global transaction IDs of in-doubt transactions.
00980      *\ingroup SSM2PC
00981      * @param[in] numtids   The number of global transaction ids in the list.
00982      * @param[out] l   The caller-provided list into which the storage
00983      * manager writes the global transaction-ids.
00984      * \details
00985      * Used by a server at start-up, after recovery, to find out the
00986      * global transaction IDs of the prepared transactions.  The storage
00987      * manager fills in the first numtids entries of the pre-allocated list.
00988      * The server may have first called the first form of query_prepared_xct
00989      * to find out how many such transactions there are after recovery.
00990      *
00991      * \attention Read-only transactions
00992      * do not appear as in-doubt transactions. Because they did not
00993      * generate any log records, they will not be "discovered" by analysis.
00994      * The server must determine that any thread of a global transaction that
00995      * does not appear to be in doubt was a read-only thread or
00996      * it never prepared and thus has been aborted.
00997      * Read-only transactions that were prepared would have voted read-only,
00998      * and if the coordinator recorded that vote on stable storage, it
00999      * should not be concerned with these transaction threads any further.
01000      * If the coordinator does not have this information recorded, the
01001      * transaction thread could have been an aborted non-read-only transaction,
01002      * so the coordinator must, in this case, presume that the thread aborted
01003      * and thus make the global transaction abort.
01004      */
01005     static rc_t           query_prepared_xct(int numtids, gtid_t l[]);
01006 
01007 
01008     /**\brief Commit a transaction.
01009      *\ingroup SSMXCT
01010      * @param[in] lazy   Optional, controls flushing of log.
01011      * @param[out] plastlsn   If non-null, this is a pointer to a
01012      *                    log sequence number into which the storage
01013      *                    manager writes the LSN of the last log record
01014      *                    inserted for this transaction.
01015      * \details
01016      *
01017      * Commit the attached transaction and detach it, destroy it.
01018      * If \a lazy is true, the log is not synced.  This means that
01019      * recovery of this transaction might not be possible.
01020      */
01021     static rc_t           commit_xct(
01022                                      bool   lazy = false,
01023                                      lsn_t* plastlsn=NULL);
01024 
01025     /**\brief Commit an instrumented transaction and get its statistics.
01026      *\ingroup SSMXCT
01027      * @param[out] stats   Get a copy of the statistics for this transaction.
01028      * @param[in] lazy   Optional, controls flushing of log.
01029      * @param[out] plastlsn   If non-null, this is a pointer to a
01030      *                    log sequence number into which the storage
01031      *                    manager writes the LSN of the last log record
01032      *                    inserted for this transaction.
01033      * \details
01034      *
01035      * Commit the attached transaction and detach it, destroy it.
01036      * If \a lazy is true, the log is not synced.  This means that
01037      * recovery of this transaction might not be possible.
01038      */
01039     static rc_t            commit_xct(
01040                                     sm_stats_info_t*& stats, 
01041                                     bool              lazy = false,
01042                                     lsn_t*            plastlsn=NULL);
01043 
01044     /**\brief Commit an instrumented transaction and start a new one.
01045      *\ingroup SSMXCT
01046      * @param[in,out] stats   In: statistics structure for the new transaction. Out: the statistics of the old (committed) transaction.
01047      * @param[in] lazy   Optional, controls flushing of log.
01048      * \details
01049      *
01050      * Commit the attached transaction and detach it, destroy it.
01051      * Start a new transaction and attach it to this thread.
01052      * \note \e The \e new
01053      * \e transaction \e inherits \e the \e locks \e of \e the \e old
01054      * \e transaction.
01055      *
01056      * If \a lazy is true, the log is not synced.  This means that
01057      * recovery of the committed transaction might not be possible.
01058      */
01059     static rc_t            chain_xct(
01060         sm_stats_info_t*&         stats,    /* in w/new, out w/old */
01061         bool                      lazy = false);  
01062 
01063     /**\brief Commit a transaction and start a new one, inheriting locks.
01064      *\ingroup SSMXCT
01065      * @param[in] lazy   Optional, controls flushing of log.
01066      * \details
01067      *
01068      * Commit the attached transaction and detach it, destroy it.
01069      * Start a new transaction and attach it to this thread.
01070      * \note \e The \e new 
01071      * \e transaction \e inherits \e the \e locks \e of \e the \e old 
01072      * \e transaction.
01073      *
01074      * If \a lazy is true, the log is not synced.  This means that
01075      * recovery of the committed transaction might not be possible.
01076      */
01077     static rc_t            chain_xct(bool lazy = false);  
01078 
01079 
01080     /**\brief Commit a group of transactions.
01081      *\ingroup SSMXCT
01082      * @param[in] list      List of pointers to transactions to commit.
01083      * @param[in] listlen   Number of transactions in the list.
01084      * \details
01085      *
01086      * Commit each transaction in the list as an all-or-none affair.
01087      * Any transaction that is attached to the thread will be
01088      * detached before anything is done.
01089      *
01090      * The purpose of this method is to allow multiple transactions
01091      * to commit together with a single log record. No voting takes place.
01092      * The entire list of transaction identifiers must fit in a single
01093      * log record. If it does not, a descriptive error will be returned and no
01094      * transaction will be committed. In this case, the server has the
01095      * option to singly commit each transaction.
01096      *
01097      * If any other error occurs during one of the commits, the error
01098      * will be returned to the caller and none of the transactions
01099      * will be committed; they \b must be aborted thereafter.
01100      *
01101      * This is not intended to be used with transactions that are
01102      * participating in two-phase commit, but if
01103      * one of the transactions is participating in two-phase commit,
01104      * they all must be and they all must be prepared.
01105      *
01106      * Chaining and lazy commit are not offered with this form of commit.
01107      * If a transaction in the list is instrumented, its statistics
01108      * resources will be deleted upon successful commit.
01109      *
01110      * \note
01111      * By taking a list of transaction pointers, this avoids the tid_to_xct lookup
01112      * for each transaction, but the server must regard the transaction pointers as
01113      * invalid after this method returns.
01114      * The transactions, once committed, do not exist anymore.
01115      * If an error is returned, the server has to re-verify the transaction pointers
01116      * by using ss_m::tid_to_xct from a separate list of transaction ids to determine
01117      * which transactions are extant.
01118      */
01119     static rc_t            commit_xct_group(
01120   xct_t *               list[],
01121   int                   listlen);
01122 
01123     /**\brief Abort an instrumented transaction and get its statistics.
01124      *\ingroup SSMXCT
01125      * @param[out] stats   Get a copy of the statistics for this transaction.
01126      * \details
01127      *
01128      * Abort the attached transaction and detach it, destroy it.
01129      */
01130     static rc_t            abort_xct(sm_stats_info_t*&  stats);
01131     /**\brief Abort a transaction.
01132      *\ingroup SSMXCT
01133      * \details
01134      *
01135      * Abort the attached transaction and detach it, destroy it.
01136      */
01137     static rc_t            abort_xct();
01138 
01139     /**\brief Populate a save point.
01140      *\ingroup SSMSP
01141      * @param[out] sp   An sm_save_point_t owned by the caller.
01142      *\details
01143      * Store in sp the needed information to be able to roll back 
01144      * to this point. 
01145      * For use with rollback_work.
01146      * \note Only one thread may be attached to a transaction when this
01147      * is called.
01148      */
01149     static rc_t            save_work(sm_save_point_t& sp);
01150 
01151     /**\brief Roll back to a savepoint.
01152      *\ingroup SSMSP
01153      * @param[in] sp   An sm_save_point_t owned by the caller and
01154      * populated by save_work.
01155      *\details
01156      * Undo everything that was 
01157      * done from the time save_work was called on this savepoint.
01158      * \note Locks are not freed.
01159      *
01160      * \note Only one thread may be attached to a transaction when this
01161      * is called.
01162      */
01163     static rc_t            rollback_work(const sm_save_point_t& sp);
01164 
01165     /**\brief Return the number of transactions in active state.
01166      *\ingroup SSMXCT
01167      * \details
01168      * While this is thread-safe, the moment a value is returned, it could
01169      * be out of date.
01170      * Useful only for debugging.
01171      */
01172     static w_base_t::uint4_t     num_active_xcts();
01173 
01174     /**\brief Attach the given transaction to the currently-running smthread_t.
01175      *\ingroup SSMXCT
01176      * \details
01177      * Forwards to me()->attach_xct(\a x). It is assumed that the currently running thread is an smthread_t.
01178      */
01179     static void           attach_xct(xct_t *x) { me()->attach_xct(x); }
01180 
01181     /**\addtogroup SSMMULTIXCT 
01182      * 
01183      * Certain operations may be performed while more than one
01184      * thread is attached to a transaction (this functionality is
01185      * soon to be deprecated).
01186      * Any number of attached threads may be read-only.
01187      * The kinds of updates that can be made by multiple threads are limited by
01188      * the need to avoid latch-mutex and latch-latch deadlocks. 
01189      *
01190      * There are several reasons for this.
01191      * 1) The multiple threads are not protected from each other by locks.
01192      * 2) Interleaving of top-level actions is not supported with rollback;
01193      * this means that for the duration of a top-level action, a thread needs
01194      * access to the log that excludes all other threads in 
01195      * the same transaction.
01196      *
01197      * The internal logging protocol is this:
01198      * T1: latch page, log update. Logging requires acquiring a mutex
01199      * on the xct's log buffer.
01200      * T2: performing any top-level action, acquires the mutex on the
01201      * xct's log buffer before doing the action (latching the page).
01202      *
01203      * Thus, anything involving top-level actions is suspect.  B-trees
01204      * use top-level actions, as does file-page allocation, and creation/
01205      * destruction of stores (files, indexes).  Thus, just about
01206      * any kind of concurrent updates on the same page
01207      * in the same transaction is problematic, and just about any update
01208      * can result in latching extent-map or store-map pages.
01209      * This activity could be disallowed by enforcing a strict 
01210      * rule that at most  one update operation can be going on 
01211      * in a transaction at any time, however this is too restrictive.
01212      *
01213      * Multiple updating threads can
01214      * work \b if \b the \b data \b are \b partitioned by volume.
01215      * So a well-behaved server may use multiple-threaded transactions
01216      * to do updates as long as the updates are on different \b volumes.
01217      * It might also allow read-only transaction threads to be
01218      * concurrent with a single updating thread.
01219      *
01220      * Savepoints and partial rollback may \e not be used with 
01221      * multi-threaded transactions. This is not enforced by the storage
01222      * manager; it is poor behavior on the part of a server.
01223      * For example, the behavior of the following is undefined:
01224      * - thread 1: attach, read,      read,   read, ...
01225      * - thread 2: attach, save work, update, rollback
01226      * If the two threads are reading and possibly updating the same 
01227      * data, the results are timing-dependent and could produce a latch-
01228      * latch or latch-mutex deadlock.
01229      *
01230      * Ongoing research at DIAS is investigating ways to extend the usefulness
01231      * of parallelism within a transaction (multi-threaded transactions).
01232      * Current thoughts about this are for servers to coordinate multiple 
01233      * transactions using two-phase commit or an optimized version
01234      * of commit and abort for groups of local transactions.
01235      */
01236 
01237     /**\brief Detach the attached transaction, if any, from the currently-running smthread_t.
01238      *\ingroup SSMXCT
01239      * \details
01240      * Severs the connection between the running thread and its transaction,
01241      * so that the thread can attach a different transaction and perform
01242      * work on its behalf.  Does nothing if no transaction is attached.
01243      */
01244     static void           detach_xct() { xct_t *attached = me()->xct();
01245                                         if(attached == NULL) return; me()->detach_xct(attached); }
01246 
01247     /**\brief Get the transaction structure for a given transaction id.
01248      *\ingroup SSMXCT
01249      * @param[in] tid   Transaction ID.
01250      *\details
01251      * Return a pointer to the storage manager's transaction structure.
01252      * Can be used with detach_xct and attach_xct.
01253      */
01254     static xct_t*          tid_to_xct(const tid_t& tid);
01255     /**\brief Get the transaction ID for a given transaction structure.
01256      *\ingroup SSMXCT
01257      * @param[in] x   Pointer to transaction structure.
01258      *\details
01259      * Return the transaction ID for the given transaction.
01260      */
01261     static tid_t           xct_to_tid(const xct_t* x);
01262 
01263     /**\brief Print transaction information to an output stream.
01264      *\ingroup SSMAPIDEBUG
01265      * @param[in] o   Stream to which to write the information.
01266      * \details
01267      * This is for debugging only, and is not thread-safe. 
01268      */
01269     static rc_t            dump_xcts(ostream &o);
01270 
01271     /**\brief Get the transaction state for a given transaction (structure).
01272      *\ingroup SSMXCT
01273      * @param[in] x   Pointer to transaction structure.
01274      * \details
01275      * Returns the state of the transaction (active, prepared). It is
01276      * hard to get the state of an aborted or committed transaction, since
01277      * their structures no longer exist.
01278      */
01279     static xct_state_t     state_xct(const xct_t* x);
01280 
01281     /**\brief Return the amount of log this transaction would consume
01282      * if it rolled back.
01283      *\ingroup SSMXCT
01284      *
01285      * If a transaction aborts with eOUTOFLOGSPACE this function can
01286      * be used in conjunction with xct_reserve_log_space to
01287      * pre-allocate the needed amount of log space before retrying.
01288      */
01289     static smlevel_0::fileoff_t        xct_log_space_needed();
01290 
01291     /**\brief Require the specified amount of log space to be
01292      * available for this transaction before continuing.
01293      *\ingroup SSMXCT
01294      *
01295      * If a transaction risks running out of log space it can
01296      * pre-request some or all of the needed amount before starting in
01297      * order to improve its chances of success. Other new transactions
01298      * will be unable to acquire log space before this request is
01299      * granted (existing ones will be able to commit, unless they also
01300      * run out of space, because that tends to free up log space and
01301      * avoids wasting work).
01302      */
01303     static rc_t            xct_reserve_log_space(fileoff_t amt);
01304     
01305     /**\brief Get the locking granularity for the attached transaction.
01306      * \ingroup SSMLOCK
01307      */
01308     static concurrency_t   xct_lock_level();
01309     /**\brief Set the default locking level for the attached transaction.
01310      * \ingroup SSMLOCK
01311      * \details
01312      * @param[in] l  The level to use for the balance of this transaction.
01313      * Legitimate values are t_cc_record,  t_cc_page,  t_cc_file.
01314      *
01315      * \note Only one thread may be attached to the transaction when this
01316      * is called. If more than one thread is attached, a fatal error
01317      * will ensue.
01318      */
01319     static void            set_xct_lock_level(concurrency_t l);
01320 
01321     /**\brief Collect transaction information in a virtual table.
01322      * \ingroup SSMVTABLE
01323      * \details
01324      * @param[out] v  The virtual table to populate.
01325      * @param[in] names_too  If true, make the 
01326      *            first row of the table a list of the attribute names.
01327      *
01328      * All attribute values will be strings.
01329      * The virtual table v can be printed with its output operator
01330      * operator<< for ostreams.
01331      *
01332      * \attention Not atomic. Can yield stale data. 
01333      */
01334     static rc_t            xct_collect(vtable_t&v, bool names_too=true);
01335 
01336     /**\brief Collect buffer pool information in a virtual table.
01337      * \ingroup SSMVTABLE
01338      * \details
01339      * @param[out] v  The virtual table to populate.
01340      * @param[in] names_too  If true, make the 
01341      *            first row of the table a list of the attribute names.
01342      *
01343      * \attention Be wary of using this with a large buffer pool.
01344      *
01345      * All attribute values will be strings.
01346      * The virtual table v can be printed with its output operator
01347      * operator<< for ostreams.
01348      *
01349      * \attention Not atomic. Can yield stale data. 
01350      */
01351     static rc_t            bp_collect(vtable_t&v, bool names_too=true);
01352 
01353     /**\brief Collect lock table information in a virtual table.
01354      * \ingroup SSMVTABLE
01355      * \details
01356      * @param[out] v  The virtual table to populate.
01357      * @param[in] names_too  If true, make the 
01358      *            first row of the table a list of the attribute names.
01359      *
01360      * All attribute values will be strings.
01361      * The virtual table v can be printed with its output operator
01362      * operator<< for ostreams.
01363      *
01364      * \attention Not atomic. Can yield stale data. 
01365      * Cannot be used in a multi-threaded-transaction context.
01366      */
01367     static rc_t            lock_collect(vtable_t&v, bool names_too=true);
01368 
01369     /**\brief Collect thread information in a virtual table.
01370      * \ingroup SSMVTABLE
01371      * \details
01372      * @param[out] v  The virtual table to populate.
01373      * @param[in] names_too  If true, make the 
01374      *            first row of the table a list of the attribute names.
01375      *
01376      * All attribute values will be strings.
01377      * The virtual table v can be printed with its output operator
01378      * operator<< for ostreams.
01379      *
01380      * \attention Not thread-safe. Can yield stale data. 
01381      */
01382     static rc_t            thread_collect(vtable_t&v, bool names_too=true);
01383 
01384     /**\brief Take a checkpoint.
01385      * \ingroup SSMAPIDEBUG
01386      * \note For debugging only!
01387      *
01388      * Force the storage manager to take a checkpoint.
01389      * Checkpoints are fuzzy : they can be taken while most other
01390      * storage manager activity is happening, even though they have
01391      * to be serialized with respect to each other, and with respect to
01392      * a few other activities.
01393      *
01394      * This is thread-safe.
01395      */
01396     static rc_t            checkpoint();
01397 
01398     /**\brief Force the buffer pool to flush its pages to disk.
01399      * \ingroup SSMAPIDEBUG
01400      * @param[in] invalidate   True means discard pages after flush.
01401      * \note For debugging only!
01402      * \attention Do not call force_buffers with anything pinned.
01403      * You may cause latch-latch deadlocks, as this method has
01404      * to scan the entire buffer pool and possibly EX-latch pages to prevent
01405      * others from updating while it forces to disk.
01406      * Since the page-order is essentially random, we cannot
01407      * preclude latch-latch deadlocks with other threads.
01408      */
01409     static rc_t            force_buffers(bool invalidate = false);
01410 
01411     /**\brief Force the buffer pool to flush the volume header page(s)
01412      * to disk.
01413      * \ingroup SSMAPIDEBUG
01414      * @param[in] vid   ID of the volume of interest
01415      * \note For debugging only!
01416      * \attention Do not call force_vol_hdr_buffers with anything pinned.
01417      * You could cause latch-latch deadlocks, as this method has
01418      * to scan the entire buffer pool and possibly EX-latch some pages.
01419      * Since the page-order is essentially random, we cannot
01420      * preclude latch-latch deadlocks with other threads.
01421      */
01422     static rc_t            force_vol_hdr_buffers( const vid_t&   vid);
01423 
01424     /**\brief Force the buffer pool to flush to disk all pages
01425      * for the given store.
01426      * \ingroup SSMAPIDEBUG
01427      * @param[in] stid   Store whose pages are to be flushed.
01428      * @param[in] invalidate   True means discard the pages after flushing.
01429      * \note For debugging only!
01430      * \attention Do not call force_store_buffers with anything pinned.
01431      * You may cause latch-latch deadlocks, as this method has
01432      * to scan the entire buffer pool and, if invalidate==true,
01433      * EX-latch pages to prevent others from updating 
01434      * while it forces to disk.
01435      * Since the page-order is essentially random, we cannot
01436      * preclude latch-latch deadlocks with other threads.
01437      */
01438     static rc_t            force_store_buffers(const stid_t & stid,
01439                                                bool invalidate);
01440 
01441     /**\cond skip 
01442      * Do not document. Very un-thread-safe.
01443      */
01444     static rc_t            dump_buffers(ostream &o);
01445     static rc_t            dump_locks(ostream &o);
01446     static rc_t            dump_locks(); // defaults to std::cout
01447     static rc_t            dump_exts(ostream &o, 
01448         vid_t                    v, 
01449         extnum_t                 start, 
01450         extnum_t                 end);
01451 
01452     static rc_t            dump_stores(ostream &o, 
01453         vid_t                    v, 
01454         int                      start, 
01455         int                      end);
01456 
01457     static rc_t            dump_histo(ostream &o, bool locked);
01458 
01459     static rc_t            snapshot_buffers(
01460         u_int&                 ndirty, 
01461         u_int&                 nclean, 
01462         u_int&                 nfree,
01463         u_int&                 nfixed);
01464     /**\endcond skip */
01465 
01466     /**\brief Get a copy of the statistics from an attached instrumented transaction.
01467      * \ingroup SSMXCT
01468      * \details
01469      * @param[out] stats Returns a copy of the statistics for this transaction.
01470      * @param[in] reset  If true, the statistics for this transaction will be zeroed.
01471      */
01472     static rc_t            gather_xct_stats(
01473         sm_stats_info_t&       stats, 
01474         bool                   reset = false);
01475 
01476     /**\brief Get a copy of the global statistics.
01477      * \ingroup SSMSTATS
01478      * \details
01479      * @param[out] stats A pre-allocated structure.
01480      */
01481     static rc_t            gather_stats(
01482         sm_stats_info_t&       stats
01483         );
01484 
01485     /**\brief Get a copy of configuration-dependent information.
01486      * \ingroup OPT
01487      * \details
01488      * @param[out] info A pre-allocated structure.
01489      */
01490     static rc_t            config_info(sm_config_info_t& info);
01491 
01492     /**\brief Set sleep time before I/O operations.
01493      * \ingroup SSMVOL
01494      * \details
01495      * This method sets a milli_sec delay to occur before 
01496      * each disk read/write operation.  This is for debugging.
01497      * It is useful in discovering thread sync bugs.
01498      * This delay applies to all threads.
01499     */
01500     static rc_t            set_disk_delay(u_int milli_sec);
01501 
01502     /**\cond skip */
01503     // TODO : document crash testing facilities
01504     /**\brief Simulate a crash
01505      * \details
01506      * This method tells the log manager to start generating corrupted
01507      * log records.  This will make it appear that a crash occurred
01508      * at that point in the log.  A call to this method should be
01509      * followed immediately by a dirty shutdown of the ssm.
01510      */
01511     static rc_t            start_log_corruption();
01512 
01513     /* for smsh/debugging:   
01514      * log an arbitrary message */
01515     static rc_t            log_message(const char * const msg);
01516     /**\endcond skip */
01517 
01518     // Forces a log flush
01519     static rc_t            sync_log(bool block=true);
01520     static rc_t            flush_until(lsn_t& anlsn, bool block=true);
01521 
01522     // Allowing to access info about the important lsns (curr and durable)
01523     static rc_t            get_curr_lsn(lsn_t& anlsn);
01524     static rc_t            get_durable_lsn(lsn_t& anlsn);
01525 
01526 
01527     /*
01528        Device and Volume Management
01529        ----------------------------
01530        A device is either an operating system file or operating system
01531        device and is identified by a path name (absolute or relative).
01532        A device has a quota.  In theory, a device may have 
01533        multiple volumes on it but
01534        in the current implementation the maximum number of volumes
01535        is 1.
01536 
01537        A volume is where data is stored.  A volume is identified
01538        uniquely and persistently by a long volume ID (lvid_t).
01539        Volumes can be used whenever the device they are located
01540        on is mounted by the SM.  Volumes have a quota.  The
01541        sum of the quotas of all the volumes on a device cannot
01542        exceed the device quota.
01543 
01544        The basic steps to begin using a new device/volume are:
01545         format_dev: initialize the device
01546         mount_dev: allow use of the device and all its volumes
01547         generate_new_lvid: generate a unique ID for the volume
01548         create_vol: create a volume on the device
01549      */
01550 
01551     /*
01552      * Device management functions
01553      */
01554      /**\addtogroup SSMVOL 
01555       * The storage manager was designed to permit multiple \e volumes
01556       * on a \e device, with \e volume analogous to a Unix \e partition and
01557       * a \e device analogous to a disk, and the original SHORE contained
01558       * symmetric peer servers.  
01559       * However good that intention, multiple volumes on a device were never
01560       * implemented, and times have changed, and the storage manager no
01561       * longer has any notion of remote and local volumes.
01562       * The notion of a volume, separate from a device, remains, but may
01563       * some day disappear.
01564       *
01565       * For the time being, a device contains at most one volume. 
01566       *
01567      * A device is either an operating system file or 
01568      * an operating system device (e.g., raw disk partition) and  
01569      * is identified by a path name (absolute or relative).
01570      *
01571      * A device has a quota.  
01572      * A device is intended to have multiple volumes on it, but
01573      * in the current implementation the maximum number of volumes
01574      * is exactly 1.
01575      *
01576      * A volume is where data are stored.  
01577      * Each volume is a header and a set of pages. All pages are
01578      * the same size (this is a compile-time constant, the default being
01579      * 8K and sizes up to 64K permissible).
01580      *
01581      * A volume is identified uniquely and persistently by a 
01582      * long volume ID (lvid_t), which is stored in its header.
01583      * Volumes can be used whenever the device they are located
01584      * on is mounted by the SM.  
01585      * Volumes have a quota.  The
01586      * sum of the quotas of all the volumes on a device cannot
01587      * exceed the device quota.
01588      *
01589      * A volume contains a variety of data structures. All user
01590      * data reside in \e stores.  A store is a collection of the
01591      * pages on the volume, allocated in \e extents of a size that
01592      * is a compile-time constant. (The storage manager has only
01593      * been tested with an extent-size of 8 pages. The compile-time constant
01594      * can be changed, but it also requires changes elsewhere in the code
01595      * to maintain alignment of persistent structures.
01596      * See the comments in config/shore.def.) Thus, the minimum size
01597      * of a store is one extent's worth of pages.
01598      * Larger extents provide better clustering, but more wasted space if
01599      * small files and small indexes will be common.
01600      *
01601      * Stores are identified by a store number (snum_t).
01602      *
01603      * Each volume contains a few stores that are "overhead":
01604      * 0 -- is reserved for an extent map and a store map
01605      * 1 -- directory (dir_m)
01606      * 2 -- root index 
01607      *
01608      * Beyond that, for each (user) file created, 2 stores are used, one for
01609      * small objects, one for large objects, and for each index (btree, rtree) 
01610      * created 1 store is used.
01611      *
01612      * Each volume is laid out thus:
01613      * - volume header, which identifies the number of extents on
01614      *   the volume, determined when the volume is formatted.
01615      *   This is always in page 1 of the volume.
01616      * - store map: some number of pages describing the stores on the volume,
01617      *   namely, being the heads of linked-lists of extents that make up
01618      *   the stores. The number of such pages is determined when the
01619      *   volume is formatted.  The worst case is assumed, which is that one
01620      *   might fill the volume with one-extent stores.
01621      * - extent map: some number of pages of bitmaps, one bitmap for each 
01622      *   extent, describing which pages in the extents are allocated or free.
01623      * - data pages: the rest of the volume.
01624      *
01625      */
01626 
01627     /**\brief Format a device.
01628      * \ingroup SSMVOL
01629      * \details
01630      * @param[in] device   Operating-system file name of the "device".
01631      * @param[in] quota_in_KB  Quota in kilobytes.
01632      * @param[in] force If true, format the device even if it already exists.
01633      *
01634      * Since raw devices always "exist", \a force should be given as true 
01635      * for raw devices.
01636      *
01637      * A device may not be formatted if it is already mounted.
01638      *
01639      * \note This method should \b not 
01640      * be called in the context of a transaction.
01641      */
01642     static rc_t            format_dev(
01643         const char*            device,
01644         smksize_t              quota_in_KB,
01645         bool                   force);
01646     
01647     /**\brief Mount a device.
01648      * \ingroup SSMVOL
01649      * \details
01650      * @param[in] device   Operating-system file name of the "device".
01651      * @param[out] vol_cnt Number of volumes on the device.
01652      * @param[out] devid  A local device id assigned by the storage manager.
01653      * @param[in] local_vid A local handle to the (only) volume on the device,
01654      * to be used when a volume is mounted.  The default, vid_t::null, 
01655      * indicates that the storage manager can choose a value for this. 
01656      *
01657      * \note It is fine to mount a device more than once, as long as device
01658      * is always the same (you cannot specify a hard link or soft link to
01659      * an entity mounted under a different path). 
01660      * Device mounts are \b not reference-counted, so a single dismount_dev
01661      * renders the volumes on the device unusable.
01662      *
01663      * \note This method should \b not 
01664      * be called in the context of a transaction.
01665      */
01666     static rc_t            mount_dev(
01667         const char*            device,
01668         u_int&                 vol_cnt,
01669         devid_t&               devid,
01670         vid_t                  local_vid = vid_t::null);
01671 
01672     /**\brief Dismount a device.
01673      * \ingroup SSMVOL
01674      * \details
01675      * @param[in] device   Operating-system file name of the "device".
01676      *
01677      * \note It is fine to mount a device more than once, as long as device
01678      * is always the same (you cannot specify a hard link or soft link to
01679      * an entity mounted under a different path). 
01680      * Device mounts are \b not reference-counted, so a single dismount_dev
01681      * renders the volumes on the device unusable.
01682      *
01683      * \note This method should \b not 
01684      * be called in the context of a transaction.
01685      */
01686 
01687     static rc_t            dismount_dev(const char* device);
01688 
01689     /**\brief Dismount all mounted devices.
01690      * \ingroup SSMVOL
01691      *
01692      * \note This method should \b not 
01693      * be called in the context of a transaction.
01694      */
01695     static rc_t            dismount_all();
01696 
01697     // list_devices returns an array of char* pointers to the names of
01698     // all mounted devices.  Note that the use of char*'s is 
01699     // a temporary hack until a standard string class is available.
01700     // the char* pointers are pointing directly into the device
01701     // mount table.
01702     // dev_cnt is the length of the list returned.
01703     // dev_list and devid_list must be deleted with delete [] by the
01704     // caller if they are not null (0).  They should be null
01705     // if an error is returned or if there are no devices.
01706     /**\brief Return a list of all mounted devices.
01707      * \ingroup SSMVOL
01708      * \details
01709      * @param[out] dev_list   Returned list of pointers directly into the mount table.
01710      * @param[out] devid_list   Returned list of associated device ids.
01711      * @param[out] dev_cnt   Returned number of entries in the two above lists.
01712      *
01713      * The storage manager allocates the arrays returned with new[], and the
01714      * caller must return these to the heap with delete[] if they are not null.
01715      * They will be null if an error is returned or if no devices are mounted.
01716      *
01717      * The strings to which dev_list[*] point are \b not to be deleted by
01718      * the caller.
01719      */
01720     static rc_t            list_devices(
01721         const char**&            dev_list, 
01722         devid_t*&                devid_list, 
01723         u_int&                   dev_cnt);
01724 
01725     /**\brief Return a list of all volumes on a device.
01726      * \ingroup SSMVOL
01727      * \details
01728      * @param[in] device   Operating-system file name of the "device".
01729      * @param[out] lvid_list   Returned list of the long volume ids (lvid_t) on the device.
01730      * @param[out] lvid_cnt   Returned length of list lvid_list.
01731      *
01732      * The storage manager allocates the array lvid_list 
01733      * with new[], and the
01734      * caller must return it to the heap with delete[] if it is not null.
01735      * It will be null if an error is returned. 
01736      *
01737      * \note This method should \b not 
01738      * be called in the context of a transaction.
01739      */
01740     static rc_t            list_volumes(
01741         const char*            device,
01742         lvid_t*&               lvid_list,
01743         u_int&                 lvid_cnt
01744     );
01745 
01746     // get_device_quota the "quota" (in KB) of the device
01747     // and the amount of the quota allocated to volumes on the device.
01748     /**\brief Get the device quota.
01749      * \ingroup SSMVOL
01750      * \details
01751      * @param[in] device   Operating-system file name of the "device".
01752      * @param[out] quota_KB   Returned quota in kilobytes
01753      * @param[out] quota_used_KB   Returned portion of quota allocated to volumes
01754      *
01755      * The quota_used_KB is the portion of the quota allocated to volumes on the device.
01756      *
01757      * \note This method \b may 
01758      * be called in the context of a transaction.
01759      *
01760      * \note This method \b may 
01761      * be called in the context of a transaction.
01762      */
01763     static rc_t            get_device_quota(
01764         const char*             device, 
01765         smksize_t&              quota_KB, 
01766         smksize_t&              quota_used_KB);
01767 
01768 
01769     /*
01770      * Volume management functions
01771      */
01772 
01773     /**\brief Change the fake disk latency before I/Os on this volume, 
01774      * for debugging purposes
01775      * \ingroup SSMVOL
01776      * \details
01777      * @param[in] vid  The ID of the volume of interest.
01778      * @param[in] adelay  Nanoseconds to sleep with ::nanosleep()
01779      *
01780      * This is for debugging only.
01781      * Changing the value of the latency for a volume does not enable the
01782      * delay.
01783      */
01784     static rc_t set_fake_disk_latency(vid_t vid, const int adelay);
01785 
01786     /**\brief Enable the fake disk latency before I/Os on this volume, for debugging purposes
01787      * \ingroup SSMVOL
01788      * \details
01789      * @param[in] vid  The ID of the volume of interest.
01790      *
01791      * This is for debugging only.
01792      * When this is enabled, it uses whatever disk latency was set with
01793      * ss_m::create_vol() or the last applied ss_m::set_fake_disk_latency().
01794      */
01795     static rc_t enable_fake_disk_latency(vid_t vid);
01796     /**\brief Disable the fake disk latency before I/Os on this volume, for debugging purposes
01797      * \ingroup SSMVOL
01798      * \details
01799      * @param[in] vid  The ID of the volume of interest.
01800      *
01801      * This is for debugging only.
01802      */
01803     static rc_t disable_fake_disk_latency(vid_t vid);
01804 
01805 
01806     /**\brief Generate a unique long volume id.
01807      * \ingroup SSMVOL
01808      * \details
01809      * @param[out] lvid  Long volume id to be used on ss_m::create_vol().
01810      *
01811      * This generates a unique volume identifier to be written persistently
01812      * on the volume when it is formatted.
01813      * This enables us to avoid the mistake of doubly-mounting a volume.
01814      * The identifier is constructed from the machine network address and the
01815      * time of day.
01816      */
01817     static rc_t generate_new_lvid(lvid_t& lvid);
01818      
01819     /**\brief Add a volume to a device.
01820      * \ingroup SSMVOL
01821      * \details
01822      * @param[in] device_name   Operating-system file name of the "device".
01823      * @param[in] lvid  Long volume id to use when formatting the new volume.
01824      * @param[in] quota_KB  Quota in kilobytes.
01825      * @param[in] skip_raw_init  Do not initialize the volume if on a raw device.
01826      * @param[in] local_vid Short volume id by which to refer to this volume.
01827      *            If null, the storage manager will assign one.
01828      * @param[in] apply_fake_io_latency See ss_m::enable_fake_disk_latency()
01829      * @param[in] fake_disk_latency See ss_m::set_fake_disk_latency()
01830      *
01831      * \note This method should \b not 
01832      * be called in the context of a transaction.
01833      *
01834      * The pages on the volume \b must be zeroed; you can only use
01835      * \a skip_raw_init = true if you have by some other means
01836      * already initialized the volume.
01837      */
01838     static rc_t            create_vol(
01839         const char*             device_name,
01840         const lvid_t&           lvid,
01841         smksize_t               quota_KB,
01842         bool                    skip_raw_init = false,
01843         vid_t                   local_vid = vid_t::null,
01844         const bool              apply_fake_io_latency = false,
01845         const int               fake_disk_latency = 0);
01846 
01847     /**\brief Destroy a volume.
01848      * \ingroup SSMVOL
01849      * \details
01850      * @param[in] lvid  Long volume id by which the volume is known.
01851      *
01852      * \note This method should \b not 
01853      * be called in the context of a transaction.
01854      */
01855     static rc_t            destroy_vol(const lvid_t& lvid);
01856 
01857     /**\brief Gets the quotas associated with the volume.
01858      * \ingroup SSMVOL
01859      * @param[in] lvid  Long volume id by which the volume is known.
01860      * @param[out] quota_KB  Quota given when the volume was created.
01861      * @param[out] quota_used_KB  Portion of the quota that has been used by
01862      * allocated extents.
01863      */
01864     static rc_t            get_volume_quota(
01865         const lvid_t&             lvid, 
01866         smksize_t&                quota_KB, 
01867         smksize_t&                quota_used_KB);
01868 
01869     /**\cond skip */
01870     // check_volume_page_types: strictly for debugging/testing
01871     static rc_t             check_volume_page_types(vid_t vid);
01872     /**\endcond skip */
01873 
01874 
01875     /**\brief Analyze a volume and report statistics regarding disk usage.
01876      * \ingroup SSMVOL
01877      * @param[in] vid The volume of interest.
01878      * @param[out] du The structure that will hold the collected statistics.
01879      * @param[in] audit If "true", the method acquires a share lock on the
01880      * volume and then will check assertions about the
01881      * correctness of the data structures on the volume. 
01882      * If the audit fails an internal fatal error is generated 
01883      * to facilitate debugging. (It will generate a core file if your
01884      * shell permits such.)
01885      * If "false" an IS lock is acquired, which means that the
01886      * statistics will be fuzzy.
01887      *
01888      * Using the audit feature is useful for debugging.
01889      * It is the only safe way to use this method.
01890      * \note The statistics are added to the sm_du_stats_t structure passed in.
01891      * This structure is not cleared by the storage manager.
01892      */
01893     static rc_t            get_du_statistics(
01894         vid_t                 vid,
01895         sm_du_stats_t&        du,
01896         bool                  audit = true); 
01897 
01898     /**\brief Analyze a store and report statistics regarding disk usage.
01899      * \ingroup SSMVOL
01900      * @param[in] stid The store of interest.
01901      * @param[out] du The structure that will hold the collected statistics.
01902      * @param[in] audit If "true", the method acquires a share lock on the
01903      * store and then will check assertions about the
01904      * correctness of the data structures on the store. 
01905      *
01906      * Using the audit feature is useful for debugging.
01907      * It is the only safe way to use this method.
01908      *
01909      */
01910     static rc_t            get_du_statistics(
01911         const stid_t&        stid, 
01912         sm_du_stats_t&       du,
01913         bool                 audit = true);
01914     
01915     /**\brief Dump disk information about the indicated volume.
01916      * \ingroup SSMVOL
01917      * @param[in] vid The volume of interest.
01918      *
01919      * This function is for debugging.
01920      * It dumps, to the error log, at info_prio priority,
01921      * metadata about the given volume, including the number of extents
01922      * on the volume, the extent size, and the number of pages dedicated
01923      * to store maps and extent maps. Then, for each store on the volume,
01924      * it dumps the status of the store and the extents allocated to 
01925      * that store.
01926      *
01927      * This function must be run in a transaction, though the function
01928      * is read-only.
01929      */
01930     static rc_t            dump_vol_store_info(const vid_t &vid);
01931 
01932     /**\brief Analyze  a volume and collect brief statistics about its usage.
01933      * \ingroup SSMVOL
01934      * @param[in] vid The volume of interest.
01935      * @param[out] volume_stats The statistics are written here.
01936      * @param[in] cc Indicates whether the volume is to be locked 
01937      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01938      *
01939      * If no lock is acquired, the method can fail with eRETRY.
01940      *
01941      */
01942     static rc_t            get_volume_meta_stats(
01943         vid_t                vid,
01944         SmVolumeMetaStats&   volume_stats,
01945         concurrency_t        cc = t_cc_none
01946     );
01947 
01948     /**\brief Analyze a volume and collect brief statistics about its files.
01949      * \ingroup SSMVOL
01950      * @param[in] vid The volume of interest.
01951      * @param[in] num_files The size of the array file_stats.
01952      * @param[out] file_stats Preallocated array of structs into which to
01953      * write the statistics for the individual files inspected.
01954      * @param[in] batch_calculate  True means make one pass over the volume.
01955      * @param[in] cc Indicates whether the volume is to be locked 
01956      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01957      *
01958      * If no lock is acquired and batch_calculate is not set, 
01959      * the method can fail with eRETRY.
01960      *
01961      *
01962      * If batch_calculate is true then this works by making one pass
01963      * over the meta data, but it looks at all the meta data.  This
01964      * should be the faster way to do the analysis when there are 
01965      * many files, and when files use a large portion of the volume.
01966      *
01967      * If batch_calculate is false then each file is updated
01968      * individually, only looking at the extent information for that
01969      * particular file. This requires a pass over the volume for each
01970      * file. (Seek-wise it is less efficient).
01971      *
01972      */
01973     static rc_t            get_file_meta_stats(
01974         vid_t                vid,
01975         w_base_t::uint4_t    num_files,
01976         SmFileMetaStats*     file_stats,
01977         bool                 batch_calculate = false,
01978         concurrency_t        cc = t_cc_none
01979     );
01980    
01981     /**\brief Get the index ID of the root index of the volume.
01982      * \ingroup SSMVOL
01983      *
01984      * @param[in] v Volume of interest.
01985      * @param[out] iid Store ID of the root index: vol is set to \a v, store to store_id_root_index.
01986      * \details
01987      *
01988      * Each volume has a root index, which is a well-known
01989      * index available to the server for bootstrapping a database.
01990      * The ID is filled in inline from \a v; this method always returns RCOK.
01991      */
01992     static rc_t            vol_root_index(
01993         const vid_t&        v, 
01994         stid_t&             iid
01995     )    { iid.vol = v; iid.store = store_id_root_index; return RCOK; }
01996 
01997     /*****************************************************************
01998      * storage operations: smfile.cpp
01999      *****************************************************************/
02000     /**\addtogroup SSMSTORE 
02001      * Indexes and files are special cases of "stores".
02002      * A store is a linked list of extents, and an extent is a
02003      * contiguous group of pages.  So the store is the structure
02004      * that holds together an ordered set of pages that can be
02005      * used by a server and have an identifier (a store ID or stid_t).
02006      *
02007      * Indexes and files of records are built on stores.
02008      *
02009      * Stores have logging properties and 
02010      * other metadata associated with them.
02011      * 
02012      * The property that determines the logging level of the store is
02013      * \ref sm_store_property_t.
02014      *
02015      * Methods that let you get and change the metadata are:
02016      * - ss_m::get_store_property
02017      * - ss_m::set_store_property
02018      * - ss_m::get_store_info
02019      * - \ref snum_t
02020      *
02021      * When a transaction deletes a file or index, the deletion of the
02022      * underlying stores is delayed until the transaction commits so that
02023      * the pages allocated to the stores remain reserved (lest the
02024      * transaction aborts). The deleting transaction could, in theory,
02025      * reuse the pages for another store, but in practice that is not done.
02026      * Instead, when a store is deleted, the store is marked
02027      * for deletion and put in a list for the transaction to delete upon
02028      * commit.   At commit time, stores that have property t_load_file
02029      * or t_insert_file are converted to t_regular.
02030      */
02031 
02032     /**\brief Change the store property of a file or index.
02033      * \ingroup SSMSTORE
02034      * @param[in] stid   File ID or index ID of the store to change.
02035      * @param[in] property   Enumeration store_property_t (alias for
02036      *                   smlevel_3::sm_store_property_t, q.v.)
02037      *
02038      * \details
02039      * The possible uses of store properties are described with 
02040      * smlevel_3::sm_store_property_t.
02041      */
02042     static rc_t            set_store_property(
02043         stid_t                stid,
02044         store_property_t      property
02045         );
02046 
02047     /**\brief Get the store property of a file or index.
02048      * \ingroup SSMSTORE
02049      * @param[in] stid   File ID or index ID of the store of interest.
02050      * @param[out] property   The store's property is returned here, as a store_property_t
02051      *                  (alias for smlevel_3::sm_store_property_t, q.v.)
02052      *
02053      * \details
02054      * The possible uses of store properties are described with 
02055      * smlevel_3::sm_store_property_t.
02056      */
02057     static rc_t            get_store_property(
02058         stid_t                stid,
02059         store_property_t&     property);
02060 
02061     /**\brief Get various store information of a file or index.
02062      * \ingroup SSMSTORE
02063      * @param[in] stid   File ID or index ID of the store of interest.
02064      * @param[out] info  Reference to sm_store_info_t into which to
02065      * write the results.
02066      *
02067      * \details
02068      * Get internally stored information about a store.
02069      */
02070     static rc_t            get_store_info( 
02071         const stid_t&         stid, 
02072         sm_store_info_t&      info);
02073 
02074     //
02075     // Functions for B+tree Indexes
02076     //
02077     /**\addtogroup SSMBTREE 
02078      * The storage manager supports B+-Tree indexes, which provide associative access 
02079      * to data by associating keys with values in 1:1 or many:1 relationships.
02080      * Keys may be composed of any of the basic C-language types (integer,
02081      * unsigned, floating-point of several sizes) or
02082      * variable-length character strings (wide characters are \b not supported).
02083      *
02084      * The number of key-value pairs that an index can hold is limited by the
02085      * space available on the volume containing the index.
02086      * \anchor max_entry_size 
02087      * The combined sizes of the key and value must
02088      * be less than or equal to \ref max_entry_size, which is
02089      * a function of the page size, and is 
02090      * such that two entries of this size fit on a page along with all
02091      * the page and entry metadata.  See sm_config_info_t and ss_m::config_info.
02092      *
02093      * The minimum size of a B-Tree index is 8 pages (1 extent).
02094      *
02095      * A variety of locking protocols is supported:
02096      * - none : acquire no locks on the {key,value} pairs in the index,
02097      *   although an intention lock might be acquired on the index.
02098      * - kvl : key-value locking. See \ref MOH1.  The key or
02099      *   key-value pair is hashed into a 4-byte value and used with the
02100      *   given store id to make a lock id.
02101      * - im : index-management locking. See \ref MOH1.  
02102      *   The "value" portion of
02103      *   the key-value lock is taken to be a record id, which is used 
02104      *   for the lock id.
02105      * - modified kvl : an ad-hoc protocol used by the Paradise project. See \ref MODKVL "the scan_index_i constructor". As with index-management locking, 
02106      *   the "value" portion of
02107      *   the key-value lock is taken to be a record id, which is used 
02108      *   for the lock id.
02109      * - file : full-index locking.
02110      *
02111      * \section key_description Key Types
02112      * A B+-Tree index key has a type determined when the index is created.
02113      * All keys are stored in lexicographic format based on an interpretation of
02114      * the key determined by the key description given when the index is
02115      * created.
02116      * Lookups on the B+-Tree then involve a single byte-by-byte
02117      * comparison of two byte-strings, each composed of its concatenated
02118      * sub-keys.
02119      *
02120      * The key description is a null-terminated string as follows:
02121      \verbatim
02122      <key_description>    ::=  <fixed_len_part>*  <variable_len_part>  |
02123                                <fixed_len_part>+ 
02124      <fixed_len_part>     ::=  <type> <len> 
02125      <variable_len_part>  ::=  <type> '*' <len>
02126      <type>               ::=  'i' | 'u' | 'f' | 'b' | 'I' | 'U' | 'F' | 'B'
02127      <len>                ::=   [1-9][0-9]*
02128      \endverbatim
02129      * Thus, a key may have any number of fixed-length parts followed by at
02130      * most one variable-length part.
02131      *
02132      * The fixed-length parts (if present) consist of a type and a length.
02133      *
02134      * The variable-length part (if present) consists of a type and a length
02135      * separated by an asterisk, which is what distinguishes a variable-length
02136      * from a fixed-length part.
02137      *
02138      * Types and permissible lengths are:
02139      * - integer (1,2,4,8)
02140      * - unsigned (1,2,4,8)
02141      * - floating (4,8)
02142      * - uninterpreted byte (any length greater than zero)
02143      *
02144      * A capital letter indicates that the key part may be compressed. Only prefix
02145      * compression is implemented, so it makes sense to compress if the
02146      * first part of the key is compressible.
02147      *
02148      * Examples:
02149      * - "B40u4u2u2" : 40-byte character string followed by a 4-byte unsigned
02150      *                 integer and two 2-byte unsigned integers, such as one might
02151      *                 use for name.year.mo.day.  The character string is
02152      *                 prefix-compressed.
02153      * - "f8"        : an 8-byte floating-point number (double)
02154      * - "I8B*1000"  : An 8-byte integer followed by an uninterpreted string
02155      *                 of up to 1000 bytes, all prefix-compressed.
02156      *
02157      * \note Wide characters are not supported.
02158      *
02159      * This key descriptor is stored in the sm_store_info_t, which is
02160      * stored on the volume and is available with the method ss_m::get_store_info.
02161      * Keys are stored in \ref LEXICOFORMAT "lexicographic format". The
02162      * storage manager knows how to convert all the key types listed above.
02163      * When duplicates are permitted, the index assumes that the elements
02164      * are in lexicographic order when searching for a <key,element> pair.
02165      *
02166      * \section XXXX1 Bulk Loading 
02167      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02168      */
02169 
02170 
02171     /**\brief Create a B+-Tree index.
02172      * \ingroup SSMBTREE
02173      * @param[in] vid   Volume on which to create the index.
02174      * @param[in] ntype   Type of index. Legitimate values are: 
02175      *  - t_btree : B+-Tree with duplicate keys allowed
02176      *  - t_uni_btree : B+-Tree without duplicate keys 
02177      * @param[in] property Logging level of store. Legitimate values are:
02178      *  - t_regular
02179      *  - t_load_file
02180      *  - t_insert_file
02181      *  See sm_store_property_t for details.
02182      * @param[in] key_desc Description of key type.
02183      *  See \ref key_description for details.
02184      * @param[in] cc The locking protocol to use with this index. See
02185      * smlevel_0::concurrency_t and \ref SSMBTREE.
02186      * @param[out] stid New store ID will be returned here.
02187      */
02188     static rc_t            create_index(
02189                 vid_t                 vid, 
02190                 ndx_t                 ntype, 
02191                 store_property_t      property,
02192                 const char*           key_desc,
02193                 concurrency_t         cc, 
02194                 stid_t&               stid
02195     );
02196 
02197     /**\brief Create a B+-Tree or R*-Tree index.
02198      * \ingroup SSMBTREE
02199      *\attention For backward compatibility. Will be deprecated later.
02200      */
02201     static rc_t            create_index(
02202                 vid_t                 vid, 
02203                 ndx_t                 ntype, 
02204                 store_property_t      property,
02205                 const char*           key_desc,
02206                 stid_t&               stid
02207     );
02208 
02209     /**\brief Destroy a B+-Tree index.
02210      * \ingroup SSMBTREE
02211      *
02212      * @param[in] iid  ID of the index to be destroyed.
02213      */
02214     static rc_t            destroy_index(const stid_t& iid); 
02215 
02216     /**\brief Bulk-load a B+-Tree index from multiple data sources.
02217      * \ingroup SSMBULKLD
02218      *
02219      * @param[in] stid  ID of the index to be loaded.
02220      * @param[in] nsrcs  Number of files used for data sources.
02221      * @param[in] source  Array of IDs of files used for data sources.
02222      * @param[out] stats  Statistics concerning the load activity will be
02223      *                     written here.
02224      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02225      * duplicates by value.
02226      * @param[in] lexify_keys  If "true" the keys are assumed not to
02227      * be in 
02228      * lexicographic format, and the bulk-load will reformat the key before
02229      * storing it in the index,
02230      * otherwise they are assumed already to be in lexicographic format.
02231      *
02232      * \anchor LEXICOFORMAT 
02233      * \b Lexicographic \b format
02234      * is the translation of numbers 
02235      * (int, float, double, unsigned, etc) into byte strings
02236      * such that a lexicographic comparison of the byte strings
02237      * yields the same result as the numeric comparison of the
02238      * original data.
02239      *
02240      * \note The data must already have been sorted by 
02241      * key in lexicographic format, but the keys themselves don't have
02242      * to be in lexicographic format; if the keys are not already in
02243      * lexicographic format, \a lexify_keys must be given the value "true".
02244      *
02245      * In the case of duplicate keys, the bulk-load will handle the
02246      * sorting of the elements if \a sort_duplicates is "true"; this
02247      * sort will be done by a lexicographic comparison of the 
02248      * byte strings that compose the elements.
02249      */
02250     static rc_t            bulkld_index(
02251         const stid_t&             stid, 
02252         int                       nsrcs,
02253         const stid_t*             source,
02254         sm_du_stats_t&            stats,
02255         bool                      sort_duplicates = true,
02256         bool                      lexify_keys = true
02257     );
02258     /**\brief Bulk-load a B+-Tree index from a single data source.
02259      * \ingroup SSMBULKLD
02260      *
02261      * @param[in] stid  ID of the index to be loaded.
02262      * @param[in] source  ID of the file used as the data source.
02263      * @param[out] stats  Statistics concerning the load activity will be
02264      *                     written here.
02265      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02266      * duplicates by value.
02267      * @param[in] lexify_keys  If "true" the keys are assumed not to
02268      * be in 
02269      * lexicographic format, and the bulk-load will reformat the key before
02270      * storing it in the index,
02271      * otherwise they are assumed already to be in lexicographic format.
02272      */
02273     static rc_t            bulkld_index(
02274         const stid_t&             stid, 
02275         const stid_t&             source,
02276         sm_du_stats_t&            stats,
02277         bool                      sort_duplicates = true,
02278         bool                      lexify_keys = true
02279     );
02280     /**\brief Bulk-load a B+-Tree index from a single data stream.
02281      * \ingroup SSMBULKLD
02282      *
02283      * @param[in] stid  ID of the index to be loaded.
02284      * @param[in] sorted_stream  Iterator that serves as the data source.
02285      * @param[out] stats  Statistics concerning the load activity will be
02286      *                     written here.
02287      *
02288      * See sort_stream_i.
02289      */
02290     static rc_t            bulkld_index(
02291         const stid_t&             stid, 
02292         sort_stream_i&            sorted_stream,
02293         sm_du_stats_t&            stats);
02294 
02295     /**\cond skip */
02296     static rc_t            print_index(stid_t stid);
02297     /**\endcond skip */
02298 
02299     /**\brief Create an entry in a B+-Tree index.
02300      * \ingroup SSMBTREE
02301      *
02302      * @param[in] stid  ID of the index. 
02303      * @param[in] key  Key for the association to be created.
02304      * @param[in] el  Element for the association to be created.
02305      *
02306      * The combined sizes of the key and element vectors must
02307      * be less than or equal to \ref max_entry_size.
02308      */
02309     static rc_t            create_assoc(
02310         stid_t                   stid, 
02311         const vec_t&             key, 
02312         const vec_t&             el
02313 #ifdef SM_DORA
02314         , const bool             bIgnoreLocks = false
02315 #endif
02316     );
02317     /**\brief Remove an entry from a B+-Tree index.
02318   * If your index is non-unique (i.e., it may contain
02319   * multiple entries per key), use destroy_all_assoc.
02320   *
02321      * \ingroup SSMBTREE
02322      *
02323      * @param[in] stid  ID of the index. 
02324      * @param[in] key   Key of the entry to be removed.
02325      * @param[in] el   Element (value) of the entry to be removed.
02326      */
02327     static rc_t            destroy_assoc(
02328         stid_t                   stid, 
02329         const vec_t&             key,
02330         const vec_t&             el
02331 #ifdef SM_DORA
02332         , const bool             bIgnoreLocks = false
02333 #endif
02334     );
02335     /**\brief Destroy all entries associated with a key in a B+-Tree index.
02336      * \ingroup SSMBTREE
02337      *
02338      * @param[in] stid  ID of the index. 
02339      * @param[in] key   Key of the entries to be removed.
02340      * @param[out] num_removed   The number of entries removed is returned here.
02341      */
02342     static rc_t            destroy_all_assoc(
02343         stid_t                  stid, 
02344         const vec_t&            key,
02345         int&                    num_removed
02346     );
02347     /**\brief Find an entry associated with a key in a B+-Tree index. 
02348      * \ingroup SSMBTREE
02349      *
02350      * @param[in] stid  ID of the index. 
02351      * @param[in] key   Key of the entry to look up.
02352      * @param[out] el   Element associated with the given key will be copied into this buffer.
02353      * @param[in,out] elen On input, the length of the buffer into which the
02354      *                  result will be written. If too small, eRECWONTFIT will
02355      *                  be returned.
02356      *                  On output, the length of the result.
02357      * @param[out] found   True if an entry is found.
02358      *
02359      * If the index is not unique (allows duplicates), the first
02360      * element found with the given key will be returned.
02361      *
02362      * To locate all entries associated with a non-unique key, you must
02363      * use scan_index_i, q.v.
02364      */
02365     static rc_t            find_assoc(
02366         stid_t                  stid, 
02367         const vec_t&            key, 
02368         void*                   el, 
02369         smsize_t&               elen, 
02370         bool&                   found
02371 #ifdef SM_DORA
02372         , const bool             bIgnoreLocks = false
02373 #endif
02374     );
02375 
02376     //
02377     // Functions for R*tree (multi-dimensional(MD), spatial) Indexes
02378     //
02379 
02380     /**\addtogroup SSMRTREE 
02381      *
02382      * An R-tree is a height-balanced structure designed for indexing
02383      * multi-dimensional spatial objects.  
02384      * It stores the minimal bounding box (of 2 or more dimensions) of 
02385      * a spatial object as the key in the leaf pages.
02386      * This implementation is a variant of an R-Tree called an R*-Tree, which
02387      * improves the search performance by using a heuristic for redistributing
02388      * entries and dynamically reorganizing the tree during insertion.
02389      *
02390      * An R*-Tree stores key,value pairs where the key is of type nbox_t
02391      * and the value is of type vec_t.
02392      *
02393      * The number of key-value pairs an index can hold is limited by the space
02394      * available on the volume containing the index.
02395      * The minimum size of an R*-tree index is 8 pages.
02396      *
02397      * 
02398      * \note This implementation 
02399      * uses coarse-grained (index-level) locking and 
02400      * supports only 2 dimensions and integer coordinates.
02401      * For information about R*-trees, see the \ref BKSS.
02402      *
02403      * Example:
02404      * \code
02405      scan_rt_i scan(idx, nbox_t::t_overlap, universe, true);
02406      bool      eof;
02407      nbox_t    k;
02408      char*     e;
02409      smsize_t  elen;
02410      int       i;
02411      for(i=0; 
02412              (!(rc = scan.next(k,e,elen,eof)).is_error() && !eof);
02413              i++) ;
02414      cout << "Rtree " << idx << " contains " << i << " entries." << endl;
02415      \endcode
02416      * 
02417      *
02418      * \section XXXX2 Bulk Loading 
02419      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02420      */
02421      /*\example rtree_example.cpp*/
02422 
02423 
02424     /**\brief Create an R*-Tree (multi-dimensional spatial) index.
02425   * The storage manager does not provide
02426   * complete support for non-unique multidimensional indexes.
02427   * While you may insert multiple (distinct) entries for the same key in 
02428   * a multi-dimensional index, you will not be able to use them; only
02429   * the first can be retrieved.  
02430      * \ingroup SSMRTREE
02431      * @param[in] vid   Volume on which to create the index.
02432      * @param[in] ntype   Type of index. Legitimate values are: 
02433      *  - t_rtree : R*-Tree 
02434      * @param[in] property Logging level of store. Legitimate values are:
02435      *  - t_temporary
02436      *  - t_regular
02437      *  - t_load_file
02438      *  - t_insert_file
02439      *  See sm_store_property_t for details.
02440      * @param[in] dim Number of dimensions of the key.
02441      * The key type is an nbox_t.
02442      * See \ref nbox_t for details. 
02443      * @param[out] stid New store ID will be returned here.
02444      */
02445     static rc_t            create_md_index(
02446         vid_t                   vid, 
02447         ndx_t                   ntype, 
02448         store_property_t        property,
02449         stid_t&                 stid, 
02450         int2_t                  dim = 2
02451     );
02452 
02453     /**\brief Destroy an R*-Tree index.
02454      * \ingroup SSMRTREE
02455      *
02456      * @param[in] iid  ID of the index to be destroyed.
02457      */
02458     static rc_t            destroy_md_index(const stid_t& iid);
02459 
02460     /**\brief Bulk-load a multi-dimensional index from multiple sources.
02461      * \ingroup SSMBULKLD
02462      * @param[in] stid  ID of the index to be loaded.
02463      * @param[in] nsrcs  Number of files used for data sources.
02464      * @param[in] source  Array of IDs of files used for data sources.
02465      * @param[out] stats  Statistics concerning the load activity will be
02466      *                     written here.
02467      * @param[in] hff   Heuristic fill factor. Not used.
02468      * @param[in] hef   Heuristic expansion factor. Not used.
02469      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02470     */
02471     static rc_t            bulkld_md_index(
02472         const stid_t&             stid, 
02473         int                       nsrcs,
02474         const stid_t*             source, 
02475         sm_du_stats_t&            stats,
02476         int2_t                    hff=75,
02477         int2_t                    hef=120,
02478         nbox_t*                   universe=NULL);
02479 
02480     /**\brief Bulk-load a multi-dimensional index from a single source.
02481   * The storage manager does not provide
02482   * complete support for non-unique multidimensional indexes.
02483   * While you may insert multiple (distinct) entries for the same key in 
02484   * a multi-dimensional index, you will not be able to use them; only
02485   * the first can be retrieved.  
02486      * \ingroup SSMBULKLD
02487      * @param[in] stid  ID of the index to be loaded.
02488      * @param[in] source  ID of file to be used for data source.
02489      * @param[out] stats  Statistics concerning the load activity will be
02490      *                     written here.
02491      * @param[in] hff   Heuristic fill factor. Not used.
02492      * @param[in] hef   Heuristic expansion factor. Not used.
02493      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02494     */
02495     static rc_t            bulkld_md_index(
02496         const stid_t&             stid, 
02497         const stid_t&             source, 
02498         sm_du_stats_t&            stats,
02499         int2_t                    hff=75,
02500         int2_t                    hef=120,
02501         nbox_t*                   universe=NULL);
02502 
02503     /**\brief Bulk-load a multi-dimensional index from a sorted stream source.
02504   * The storage manager does not provide
02505   * complete support for non-unique multidimensional indexes.
02506   * While you may insert multiple (distinct) entries for the same key in 
02507   * a multi-dimensional index, you will not be able to use them; only
02508   * the first can be retrieved.  
02509      * \ingroup SSMBULKLD
02510      * @param[in] stid  ID of the index to be loaded.
02511      * @param[in] sorted_stream  Input stream that is data source.
02512      * @param[out] stats  Statistics concerning the load activity will be
02513      *                     written here.
02514      * @param[in] hff   Heuristic fill factor. Not used.
02515      * @param[in] hef   Heuristic expansion factor. Not used.
02516      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02517     */
02518     static rc_t            bulkld_md_index(
02519         const stid_t&             stid, 
02520         sort_stream_i&            sorted_stream,
02521         sm_du_stats_t&            stats,
02522         int2_t                    hff=75,
02523         int2_t                    hef=120,
02524         nbox_t*                   universe=NULL);
02525 
02526     /**\brief Print a representation of the rtree.
02527      * \ingroup SSMRTREE
02528      * @param[in] stid  ID of the index to be printed.
02529      * @param[in] out   I/O stream to which to write the output.
02530     */
02531     static rc_t            print_md_index(stid_t stid, ostream &out);
02532 
02533     /**\brief Look up an entry in a multi-dimensional index.
02534      * \ingroup SSMRTREE
02535      *
02536      * @param[in] stid  ID of the index. 
02537      * @param[in] key   Key associated with the entry to look up.
02538      * @param[out] el   Element associated with the given key will be copied into this buffer.
02539      * @param[in,out] elen On input, the length of the buffer into which the
02540      *                  result will be written. If too small, eRECWONTFIT will
02541      *                  be returned.
02542      *                  On output, the length of the result.
02543      * @param[out] found   True if an entry is found.
02544   *
02545      * If the index is not unique (allows duplicates), the first
02546      * element found with the given key will be returned.
02547      *
02548      * The storage manager does not provide a method to locate all 
02549   * entries associated with a non-unique key.
02550      */
02551     static rc_t            find_md_assoc(
02552         stid_t                    stid, 
02553         const nbox_t&             key, 
02554         void*                     el, 
02555         smsize_t&                 elen, 
02556         bool&                     found);
02557 
02558     /**\brief Create an entry in a multi-dimensional index.
02559   * The storage manager does not provide
02560   * complete support for non-unique multidimensional indexes.
02561   * While you may insert multiple (distinct) entries for the same key in 
02562   * a multi-dimensional index, you will not be able to use them; only
02563   * the first can be retrieved.  
02564      * \ingroup SSMRTREE
02565      *
02566      * @param[in] stid  ID of the index. 
02567      * @param[in] key  Key for the association to be created.
02568      * @param[in] el  Element for the association to be created.
02569     */
02570     static rc_t            create_md_assoc(
02571         stid_t                    stid, 
02572         const nbox_t&             key,
02573         const vec_t&              el);
02574 
02575     /**\brief Destroy an entry in a multi-dimensional index.
02576      * \ingroup SSMRTREE
02577      *
02578      * @param[in] stid  ID of the index. 
02579      * @param[in] key   Key of the entry to be removed.
02580      * @param[in] el   Element (value) of the entry to be removed.
02581     */
02582     static rc_t            destroy_md_assoc(
02583         stid_t                    stid, 
02584         const nbox_t&             key,
02585         const vec_t&              el);
02586 
02587     /**\cond skip */
02588     // for debugging
02589     static rc_t            draw_rtree(const stid_t& stid, ostream &);
02590     /**\endcond skip */
02591 
02592     /**\brief Gather usage statistics about an R*-Tree index.
02593      * \ingroup SSMRTREE
02594      * @param[in] stid  ID of the index. 
02595      * @param[out] stat  Usage statistics will be written here.
02596      * @param[in] size  Number of uint2_t's in the array ovp.
02597      * @param[out] ovp   Pre-allocated array of integers into which
02598      * the method will write the overlap percentages for each level of the
02599      * tree.
02600      * @param[in] audit If "true", the method 
02601      * will check assertions about the
02602      * correctness of the rtree.
02603      * If the audit fails an internal fatal error is generated 
02604      * to facilitate debugging. (It will generate a core file if your
02605      * shell permits such.)
02606      *
02607      * \note for debugging
02608     */
02609     static rc_t            rtree_stats(
02610         const stid_t&             stid,
02611         rtree_stats_t&            stat,
02612         uint2_t                   size = 0,
02613         uint2_t*                  ovp = NULL,
02614         bool                      audit = false);
02615 
02616     /**\addtogroup SSMFILE 
02617      * You can create, destroy, and scan files of records. You may exert some
02618      * control over the order in which records appear in the file (a physical
02619      * scan), but, in general, the storage manager decides where to put records.
02620      *
02621      * Pages in a file are slotted pages: Each page contains an array of
02622      * slots.
02623      * Records take one of three forms: small, large, and very large.
02624      * - Small records fit in the slots on the file pages.
02625      * - Large records are too big to fit on a slotted page, so they are put
02626      * elsewhere, and the slots point to these records.  Actually, what is
02627      * in a slot is a small array of page pointers to the data of the large record.
02628      * - A very large record is one whose slot in the file page contains
02629      *   a single reference to a page that is an index of data pages.
02630      *
02631      * Because records may take these forms, the API for creating records
02632      * contains the opportunity for you to provide a hint about the ultimate
02633      * size of the record so that the storage manager can create the proper
02634      * structure for the record immediately, rather than creating a small
02635      * record that is soon to be converted to a large, then a very large record
02636      * by subsequent appends. 
02637      *
02638      * All records contain a client-defined header.  This is for the convenience
02639      * of server-writers.  The header must fit on the slotted page, so it should
02640      * never be very large.
02641      *
02642      * The following methods manipulate files of records and the records found 
02643      * there.
02644      *
02645      * Modules below describe file traversal and
02646      * appending to files (\ref SSMSCANF), 
02647      * and pinning individual records in the buffer pool for extended operations 
02648      * (\ref SSMPIN).
02649      *
02650      * \section UNINIT Uninitialized Data
02651      * The functions create_rec, append_rec, and update_rec can be used to
02652      * write blocks of data that are all zeroes,  with minimal logging. 
02653      * This is useful for creating records of known size but with uninitialized data.  
02654      * The type zvec_t, a special case of vec_t, is for this purpose. 
02655      * Construct it with only a size, as follows:
02656      * \code
02657      * zvec_t zdata(100000);
02658      * \endcode
02659      * The underlying logging code recognizes that this is a vector of zeroes and
02660      * logs only a count, not the data themselves. 
02661      *
02662      * \section Errors
02663      * If an error occurs in the middle of one of these methods that is updating persistent data,
02664      * the record or file \e could be in an inconsistent state. 
02665      * The caller has the choice of aborting the transaction or rolling back to the nearest savepoint (see \ref SSMXCT).
02666      *
02667      * \sa SSMSCAN, SSMPIN, vec_t, zvec_t, IDs.
02668      */
02669     
02670     /**\brief Create a file of records.
02671      * \ingroup SSMFILE
02672      * \details
02673      * @param[in] vid   Volume on which to create a file.
02674      * @param[out] fid  Returns (store) ID of the new file here.
02675      * @param[in] property Give the file this property.
02676      * @param[in] cluster_hint Not used. 
02677      *
02678      * The cluster hint is included in the API for future use. 
02679      * It has no effect.
02680      */
02681     static rc_t            create_file( 
02682         vid_t                   vid, 
02683         stid_t&                 fid,
02684         store_property_t        property,
02685         shpid_t                 cluster_hint = 0
02686     ); 
02687 
02688     /**\brief Destroy a file of records.
02689      * \ingroup SSMFILE
02690      * \details
02691      * @param[in] fid  ID of the file to destroy.
02692      */
02693     static rc_t            destroy_file(const stid_t& fid); 
02694 
02695     /**\brief Create a new record.
02696      * \ingroup SSMFILE
02697      * \details
02698      * @param[in] fid  ID of the file in which to create a record.
02699      * @param[in] hdr  What to put in the record's header.
02700      * @param[in] len_hint  Hint about how big the record will ultimately be.
02701      * This is used to determine the initial format of the record. If you plan
02702      * to append to the record and know that it will ultimately become a large
02703      * record, it is more efficient to give a size hint that is larger than
02704      * a page here. Otherwise, the record will be made small (as determined by
02705      * the size of the parameter \a data ), and subsequent appends will cause 
02706      * the record to be converted to a large record.
02707      * @param[in] data  What to put in the record's body. 
02708      * @param[out] new_rid  ID of the newly created record.
02709      * @param[in] policy  File compaction policy to use. See \ref pg_policy_t
02710      * for possible values.
02711      */
02712     static rc_t            create_rec(
02713         const stid_t&            fid, 
02714         const vec_t&             hdr, 
02715         smsize_t                 len_hint, 
02716         const vec_t&             data, 
02717         rid_t&                   new_rid,
02718 #ifdef SM_DORA
02719         const bool               bIgnoreLocks = false,
02720 #endif
02721         uint4_t                  policy = t_cache | t_compact | t_append
02722     ); 
02723 
02724     /**\brief Destroy a record.
02725      * \ingroup SSMFILE
02726      * \details
02727      * @param[in] rid  ID of the record to destroy.
02728      */
02729     static rc_t            destroy_rec(const rid_t& rid
02730 #ifdef SM_DORA
02731         , const bool             bIgnoreLocks = false
02732 #endif
02733                                        );
02734 
02735     /**\brief Modify the body of an existing record.
02736      * \ingroup SSMFILE
02737      * \details
02738      * @param[in] rid  ID of the record to modify.
02739      * @param[in] start  First byte to change.
02740      * @param[in] data  What to put in the record's body.  
02741      *
02742      * This overwrites
02743      * the existing bytes, starting at the offset \a start through the
02744      * byte at \a start + \a data.size().
02745      * This method \b cannot \b be \b used to change the size of a record.
02746      * Attempting this will result in an error.
02747      */
02748     static rc_t            update_rec(
02749         const rid_t&             rid, 
02750         smsize_t                 start, 
02751         const vec_t&             data);
02752 
02753     /**\brief Modify the header of an existing record.
02754      * \ingroup SSMFILE
02755      * \details
02756      * @param[in] rid  ID of the record to modify.
02757      * @param[in] start  First byte to change.
02758      * @param[in] hdr  What to put in the record's header.  
02759      *
02760      * This overwrites
02761      * the existing bytes, starting at the offset \a start through the
02762      * byte at \a start + \a hdr.size().
02763      * This method \b cannot \b be \b used to change the size of a record
02764      * header. There are no methods for appending to or truncating a
02765      * record header.
02766      *
02767      * \sa pin_i::update_rec, \ref SSMPIN
02768      */
02769     static rc_t            update_rec_hdr(
02770         const rid_t&             rid, 
02771         smsize_t                 start, 
02772         const vec_t&             hdr);
02773     // see also pin_i::update_rec*()
02774 
02775     /**\brief Append bytes to a record body.
02776      * \ingroup SSMFILE
02777      * \details
02778      * @param[in] rid  ID of the record to modify.
02779      * @param[in] data  What to append to the record.
02780      *
02781      * \note This appends \b to a record; it does \b not append a record to a file!
02782      * \sa pin_i::append_rec, \ref SSMPIN
02783      */
02784     static rc_t            append_rec(
02785         const rid_t&             rid, 
02786         const vec_t&             data
02787                 );
02788 
02789     /**\brief Chop bytes off the end of a record body.
02790      * \ingroup SSMFILE
02791      * \details
02792      * @param[in] rid  ID of the record to modify.
02793      * @param[in] amount  How many bytes to lop off.
02794      *
02795      * \sa pin_i::truncate_rec, \ref SSMPIN
02796      */
02797     static rc_t            truncate_rec(
02798         const rid_t&             rid, 
02799         smsize_t                 amount
02800     );
02801 
02802     /**\brief Chop bytes off the end of a record body.
02803      * \ingroup SSMFILE
02804      * \details
02805      * @param[in] rid  ID of the record to modify.
02806      * @param[in] amount  How many bytes to lop off.
02807      * @param[out] should_forward  Returns true if the record started out
02808      * large but is now small as a result of the truncation.  
02809      * This enables a value-added server to take action in this event,
02810      * should it so desire.
02811      *
02812      * \sa pin_i::truncate_rec, \ref SSMPIN
02813      */
02814     static rc_t            truncate_rec(
02815         const rid_t&             rid, 
02816         smsize_t                 amount,
02817         bool&                    should_forward 
02818     );
02819 
02820 #ifdef OLDSORT_COMPATIBILITY
02821     typedef ssm_sort::key_info_t key_info_t;
02822 
02823     /* old sort physical version */
02824     /**\brief Sort a file. Deprecated.
02825      * \details
02826      */
02827     static rc_t            sort_file(
02828         const stid_t&             fid, 
02829         vid_t                     vid, 
02830         stid_t&                   sfid, 
02831         store_property_t          property,
02832         const key_info_t&         key_info, 
02833         int                       run_size,
02834         bool                      ascending = true,
02835         bool                      unique = false,
02836         bool                      destructive = false,
02837         bool                      use_new_sort = true);
02838 
02839     /**\brief Sort a file. Deprecated.
02840      * \details
02841      */
02842     static rc_t            new_sort_file(
02843         const stid_t&             fid, 
02844         vid_t                     vid, 
02845         stid_t&                   sfid, 
02846         store_property_t          property,
02847         const key_info_t&         key_info, 
02848         int                       run_size,
02849         bool                      ascending = true,
02850         bool                      unique = false,
02851         bool                      destructive = false
02852         );
02853 #endif /* OLDSORT_COMPATIBILITY */
02854 
02855     typedef ssm_sort::sort_keys_t sort_keys_t;
02856 
02857     /* new sort physical version : see notes below */
02858     /**\brief Sort a file.
02859      * \ingroup SSMSORT
02860      * @param[in] fid File to sort.
02861      * @param[in] sorted_fid File to which to write the results. 
02862      * @param[in] nvids Size of array \a vid.
02863      * @param[in] vid Array of IDs of the volumes to use for scratch files.
02864      * @param[in] kl See sort_keys_t.
02865      * @param[in] min_rec_sz Hint of minimum record size in input file.
02866      * @param[in] run_size Number of pages in buffer pool to use for a run. 
02867      * @param[in] temp_space Number of pages to use for scratch space.
02868      * (This limits the amount of memory used by the sort).
02869      *
02870      * \details
02871      * Before you call sort_file, you must create an output file \a sorted_fid
02872      * into which sort_file will write the results.
02873      *
02874      * The sort uses temporary files when the input file contains more records
02875      * than can fit in one run (determined by \a run_size). These temporary files
02876      * may be spread across multiple volumes, which is useful if the
02877      * volumes reside on different spindles.  The arguments \a nvids
02878      * and \a vid are for indicating the volumes to use for these scratch
02879      * files.
02880      *
02881      * The caller can provide a clue in \a min_rec_sz
02882      * about the minimum record size of the
02883      * input file, which can help the sort's efficiency.
02884      *
02885      * The \a run_size indicates how many buffer-pool pages to use
02886      * for each run.
02887      * Since at all times one page is fixed for output, while the rest are 
02888      * for reading the input in runs, the real run size is \a run_size-1.
02889      * 
02890      */
02891     static rc_t            sort_file(
02892         const stid_t&            fid,     // input file
02893         const stid_t&            sorted_fid, // output file -- created by caller; can be same as input file
02894         int                      nvids,    // array size for vids
02895         const vid_t*             vid,     // array of vids for temp
02896                         // files
02897                         // (the volumes on which the
02898                         // scratch files are placed)
02899         sort_keys_t&            kl, // key location info
02900         smsize_t                min_rec_sz, // for estimating space use
02901         int                     run_size,   // # pages to use for a run
02902         int                     temp_space // # pages VM to use for scratch 
02903     );
02904 
02905     /**\brief Return the short volume ID of a volume.
02906      * \ingroup SSMVOL
02907      *
02908      * @param[in] lvid Long (persistent) volume ID found on the volume's
02909      * header.
02910      * @param[out] vid Short volume ID of a mounted volume.
02911      */
02912     static rc_t            lvid_to_vid(
02913         const lvid_t&          lvid,
02914         vid_t&                 vid);
02915 
02916     /**\brief Return the long volume ID of a volume.
02917      * \ingroup SSMVOL
02918      *
02919      * @param[in] vid Short volume ID of a mounted volume.
02920      * @param[out] lvid Long (persistent) volume ID found on the volume's
02921      * header.
02922      */
02923     static rc_t            vid_to_lvid(
02924         vid_t                  vid,
02925         lvid_t&                lvid);
02926 
02927     /*****************************************************************
02928      * Locking related functions
02929      *
02930      * NOTE: there are standard conversions from lpid_t, rid_t, and
02931      *       stid_t to lockid_t, so wherever a lockid_t parameter is
02932      *         specified a lpid_t, rid_t, or stid_t can be used.
02933      *
02934      *****************************************************************/
02935 
02936 #ifdef SLI_HOOKS
02937     /* enable/disable SLI globally for all threads created after this
02938        point. Does *NOT* disable SLI for existing threads.
02939      */
02940     static void            set_sli_enabled(bool enabled);
02941     static void            set_elr_enabled(bool enabled);
02942 
02943     static rc_t            set_log_features(char const* features);
02944     static char const*         get_log_features();
02945 #endif
02946 
02947     /**\brief Acquire a lock.
02948      * \ingroup SSMLOCK
02949      * @param[in]  n  Lock id of the entity to lock. There are
02950      * conversions from record ids, volume ids, store ids, and page ids to
02951      * lockid_t.
02952      * @param[in]  m  Desired lock mode.  Values: EX, SH.
02953      * @param[in]  d  Desired duration.  Values: 
02954      * - t_very_long : Held across transaction boundaries; 
02955      *             cannot be released by unlock()
02956      * - t_long : Released at commit; cannot be released by unlock()
02957      * - t_medium : May be released early by explicit unlock()
02958      * - t_short  : May be released early by explicit unlock()
02959      * - t_instant : Not held: acquired and released immediately.  Useful
02960      *             to see if any other transaction holds an incompatible lock.
02961      * @param[in]  timeout  Milliseconds willing to block.  See timeout_in_ms.
02962      *
02963      * The lock manager is written with these durations in mind, but the
02964      * only durations used by the storage manager are t_instant and t_long.
02965      * Medium-duration locks are used internally in one place.  
02966      *
02967      * Durations other than long and instant are not well-tested.
02968      */
02969     static rc_t            lock(
02970         const lockid_t&         n, 
02971         lock_mode_t             m,
02972         lock_duration_t         d = t_long,
02973         timeout_in_ms           timeout = WAIT_SPECIFIED_BY_XCT
02974     );
02975     
02976     /**\brief Release a lock.
02977      * \ingroup SSMLOCK
02978      * @param[in]  n  Lock id of the entity to unlock. There are
02979      * conversions from record ids, volume ids, store ids, and page ids to
02980      * lockid_t.
02981      */
02982     static rc_t            unlock(const lockid_t& n);
02983 
02984     /**\brief  Disable lock escalation on the given entity. 
02985      * \ingroup SSMLOCK
02986      * @param[in]  n  Lock id of the entity to exempt from escalation. There are
02987      * conversions from record ids, volume ids, store ids, and page ids to
02988      * lockid_t.
02989      * @param[in]  passOnToDescendants If true, apply this to the descendants
02990      * of \a n.
02991      */
02992     static rc_t            dont_escalate(
02993         const lockid_t&           n,
02994         bool                      passOnToDescendants = true
02995     );
02996 
02997     /**\brief  Find the storage-manager-wide escalation thresholds
02998      * \ingroup SSMLOCK
02999      * Default values (used for all transactions until they change
03000      * their per-transaction thresholds) are determined by the
03001      * storage-manager-wide options.
03002      * See \ref SSMOPT.
03003      */
03004     static rc_t            get_escalation_thresholds(
03005         w_base_t::int4_t&        toPage,
03006         w_base_t::int4_t&        toStore,
03007         w_base_t::int4_t&        toVolume);
03008 
03009     /**\brief  Change the storage-manager-wide escalation thresholds
03010      * \ingroup SSMLOCK
03011      * Default values (used for all transactions until they change
03012      * their per-transaction thresholds) are determined by the
03013      * storage-manager-wide options.
03014      * See \ref SSMOPT.
03015      */
03016     static rc_t            set_escalation_thresholds(
03017         w_base_t::int4_t       toPage,
03018         w_base_t::int4_t       toStore,
03019         w_base_t::int4_t       toVolume);
03020 
03021     /**\brief  Find out if the attached transaction has an entity locked.
03022      * \ingroup SSMLOCK
03023      * @param[in]  n  Lock id of the entity to query. There are
03024      * conversions from record ids, volume ids, store ids, and page ids to
03025      * lockid_t.
03026      * @param[out]  m  Mode of lock held. NL if none.
03027      * @param[in]  implicit If "true" the query will return a lock mode if
03028      * an implicit lock is held, otherwise the lock must be held explicitly.
03029      */
03030     static rc_t            query_lock(
03031         const lockid_t&        n, 
03032         lock_mode_t&           m,
03033         bool                   implicit = false
03034     );
03035 
03036     /*****************************************************************
03037      * Lock Cache related functions
03038      *
03039      * Each transaction has a cache of recently acquired locks
03040      * The following functions control the use of the cache.
03041      * Note that the functions affect the transaction currently
03042      * associated with the thread.
03043      *****************************************************************/
03044     // turn on (enable=true) or off (enable=false) the lock cache;
03045     // the result is an rc_t (the previous state is not returned).
03046     /**\brief Control  lock caching for attached transaction.
03047      * \ingroup SSMLOCK
03048      *
03049      * @param[in] enable Set to true if you want to turn on lock caching
03050      * for the attached transaction.  The default is that it is turned on.
03051      *
03052      * Only long-duration locks are cached.
03053      * Lock caching can be turned off by default using the 
03054      * sm_lock_caching option.  Even with it turned off by default, it
03055      * can be turned on for a given transaction with this method.
03056      *
03057      */
03058     static rc_t            set_lock_cache_enable(bool enable);
03059 
03060     /**\brief True if lock cache is enabled for the attached transaction 
03061      * \ingroup SSMLOCK
03062      *
03063      * @param[out] enabled Will be set to true if the attached transaction has
03064      * lock caching enabled, false otherwise.
03065      */
03066     static rc_t            lock_cache_enabled(bool& enabled);
03067 
03068 private:
03069 
03070     static int _instance_cnt;
03071     static option_group_t* _options;
03072     static option_t* _hugetlbfs_path;
03073     static option_t* _reformat_log;
03074     static option_t* _prefetch;
03075     static option_t* _bufpoolsize;
03076     static option_t* _locktablesize;
03077     static option_t* _logdir;
03078     static option_t* _logsize;
03079     static option_t* _logbufsize;
03080     static option_t* _error_log;
03081     static option_t* _error_loglevel;
03082     static option_t* _lockEscalateToPageThreshold;
03083     static option_t* _lockEscalateToStoreThreshold;
03084     static option_t* _lockEscalateToVolumeThreshold;
03085     static option_t* _cc_alg_option;
03086     static option_t* _log_warn_percent;
03087     static option_t* _num_page_writers;
03088     static option_t* _logging;
03089     static option_t* _lock_caching_default;
03090 
03091 
03092     static rc_t            _set_option_logsize(
03093         option_t*              opt,
03094         const char*            value,
03095         ostream*               err_stream);
03096     
03097     static rc_t            _set_option_lock_escalate_to_page(
03098         option_t*              opt,
03099         const char*            value,
03100         ostream*               err_stream);
03101     
03102     static rc_t            _set_option_lock_escalate_to_store(
03103         option_t*              opt,
03104         const char*            value,
03105         ostream*               err_stream);
03106     
03107     static rc_t            _set_option_lock_escalate_to_volume(
03108         option_t*              opt,
03109         const char*            value,
03110         ostream*               err_stream);
03111     
03112     static rc_t            _set_store_property(
03113         stid_t                stid,
03114         store_property_t      property);
03115 
03116     static rc_t            _get_store_property(
03117         stid_t                stid,
03118         store_property_t&     property);
03119 
03120     static rc_t         _begin_xct(
03121         sm_stats_info_t*      stats,  // allocated by caller
03122         tid_t&                tid, 
03123         timeout_in_ms         timeout);
03124 
03125     static rc_t            _commit_xct(
03126         sm_stats_info_t*&     stats,
03127         bool                  lazy,
03128         lsn_t* plastlsn);
03129 
03130     static rc_t            _commit_xct_group(
03131   xct_t *               list[],
03132   int                   listlen);
03133 
03134     static rc_t            _prepare_xct(
03135         sm_stats_info_t*&     stats,
03136         vote_t&                v);
03137 
03138     static rc_t            _set_coordinator(const server_handle_t &); 
03139     
03140     static rc_t            _enter_2pc(const gtid_t &); 
03141     static rc_t            _force_vote_readonly(); 
03142     static rc_t            _recover_2pc(const gtid_t &,// in
03143                                 bool    mayblock,
03144                                 tid_t    &    //out -- attached if found(?)
03145                             );
03146     static rc_t            _chain_xct(
03147         sm_stats_info_t*&      stats,
03148         bool                   lazy);
03149 
03150     static rc_t            _abort_xct(
03151         sm_stats_info_t*&      stats);
03152 
03153     static rc_t            _save_work(sm_save_point_t& sp);
03154 
03155     static rc_t            _rollback_work(const sm_save_point_t&        sp);
03156     static rc_t            _mount_dev(
03157         const char*            device,
03158         u_int&                 vol_cnt,
03159         vid_t                  local_vid);
03160 
03161     static rc_t            _dismount_dev(
03162         const char*            device,
03163         bool                   dismount_if_locked = true
03164     );
03165     static rc_t            _create_vol(
03166         const char*            device_name,
03167         const lvid_t&          lvid,
03168         smksize_t              quota_KB,
03169         bool                   skip_raw_init,
03170         const bool             apply_fake_io_latency,
03171         const int              fake_disk_latency);
03172 
03173     static rc_t            _create_index(
03174         vid_t                 vid, 
03175         ndx_t                 ntype, 
03176         store_property_t      property,
03177         const char*           key_desc,
03178         concurrency_t         cc,
03179         stid_t&               stid
03180     );
03181 
03182     static rc_t            _destroy_index(const stid_t& iid); 
03183 
03184     static rc_t            _get_store_info( 
03185         const stid_t  &       stid, 
03186         sm_store_info_t&      info);
03187 
03188     static rc_t            _bulkld_index(
03189         const stid_t&         stid,
03190         int                   nsrcs,
03191         const stid_t*         source,
03192         sm_du_stats_t&        stats,
03193         bool                  sort_duplicates = true,
03194         bool                  lexify_keys = true
03195     );
03196 
03197     static rc_t            _bulkld_index(
03198         const stid_t&          stid, 
03199         sort_stream_i&         sorted_stream,
03200         sm_du_stats_t&         stats
03201     );
03202 
03203     static rc_t            _print_index(const stid_t &iid);
03204 
03205     static rc_t            _create_assoc(
03206         const stid_t  &        stid, 
03207         const vec_t&           key, 
03208         const vec_t&           el
03209 #ifdef SM_DORA
03210         , const bool             bIgnoreLocks = false
03211 #endif
03212     );
03213 
03214     static rc_t            _destroy_assoc(
03215         const stid_t &        stid, 
03216         const vec_t&          key,
03217         const vec_t&          el
03218 #ifdef SM_DORA
03219         , const bool             bIgnoreLocks = false
03220 #endif
03221     );
03222 
03223     static rc_t            _destroy_all_assoc(
03224         const stid_t&        stid, 
03225         const vec_t&         key,
03226         int&                 num_removed
03227     );
03228     static rc_t            _find_assoc(
03229         const stid_t&        stid, 
03230         const vec_t&         key, 
03231         void*                el, 
03232         smsize_t&            elen, 
03233         bool&                found
03234 #ifdef SM_DORA
03235         , const bool             bIgnoreLocks = false
03236 #endif
03237     );
03238 
03239     // below method overloaded for rtree
03240     static rc_t            _create_md_index(
03241         vid_t                 vid, 
03242         ndx_t                 ntype, 
03243         store_property_t      property,
03244         stid_t&               stid, 
03245         int2_t                dim=2
03246     );
03247 
03248     static rc_t            _destroy_md_index(const stid_t& iid);
03249 
03250     static rc_t            _destroy_md_assoc(
03251         stid_t                stid,
03252         const nbox_t&         key,
03253         const vec_t&          el);
03254 
03255     static rc_t            _bulkld_md_index(
03256         const stid_t&         stid, 
03257         int                   nsrcs,
03258         const stid_t*         source, 
03259         sm_du_stats_t&        stats,
03260         int2_t                hff,           // for rtree only
03261         int2_t                hef,           // for rtree only
03262         nbox_t*               universe);// for rtree only
03263 
03264     static rc_t            _bulkld_md_index(
03265         const stid_t&         stid, 
03266         sort_stream_i&        sorted_stream,
03267         sm_du_stats_t&        stats,
03268         int2_t                hff,           // for rtree only
03269         int2_t                hef,           // for rtree only
03270         nbox_t*               universe);// for rtree only
03271 
03272     static rc_t            _print_md_index(stid_t stid, ostream &);
03273 
03274     static rc_t            _create_md_assoc(
03275         stid_t                stid, 
03276         const nbox_t&         key,
03277         const vec_t&          el);
03278 
03279     static rc_t            _find_md_assoc(
03280         stid_t                stid, 
03281         const nbox_t&         key, 
03282         void*                 el, 
03283         smsize_t&             elen, 
03284         bool&                 found);
03285 
03286     //
03287     // The following functions deal with files of records.
03288     //
03289     static rc_t            _destroy_n_swap_file(
03290         const stid_t&         old_fid,
03291         const stid_t&         new_fid);
03292 
03293     static rc_t            _create_file(
03294         vid_t                 vid, 
03295         stid_t&               fid,
03296         store_property_t     property,
03297         shpid_t              cluster_hint = 0
03298     ); 
03299 
03300     static rc_t            _destroy_file(const stid_t& fid); 
03301 
03302     static rc_t            _create_rec(
03303         const stid_t&            fid, 
03304         const vec_t&             hdr, 
03305         smsize_t                 len_hint, 
03306         const vec_t&             data, 
03307         rid_t&                   new_rid,
03308         uint4_t                  policy 
03309 #ifdef SM_DORA
03310         , const bool             bIgnoreLocks = false
03311 #endif
03312         ); 
03313 
03314     static rc_t            _destroy_rec(
03315         const rid_t&             rid
03316 #ifdef SM_DORA
03317         , const bool             bIgnoreLocks = false
03318 #endif
03319         );
03320 
03321     static rc_t            _update_rec(
03322         const rid_t&             rid, 
03323         smsize_t                 start, 
03324         const vec_t&             data
03325 #ifdef SM_DORA
03326         , const bool             bIgnoreLocks = false
03327 #endif
03328         );
03329 
03330     static rc_t            _update_rec_hdr(
03331         const rid_t&             rid, 
03332         smsize_t                 start, 
03333         const vec_t&             hdr
03334 #ifdef SM_DORA
03335         , const bool             bIgnoreLocks = false
03336 #endif
03337         );
03338 
03339     static rc_t            _append_rec(
03340         const rid_t&             rid, 
03341         const vec_t&             data
03342         );
03343 
03344     static rc_t            _truncate_rec(
03345             const rid_t&         rid, 
03346             smsize_t             amount,
03347             bool&                should_forward
03348         );
03349 
03350     static rc_t            _draw_rtree(const stid_t& stid, ostream &);
03351 
03352     static rc_t            _rtree_stats(
03353             const stid_t&       stid,
03354             rtree_stats_t&      stat,
03355             uint2_t             size,
03356             uint2_t*            ovp,
03357             bool                audit
03358         );
03359 
03360 #ifdef OLDSORT_COMPATIBILITY
03361     /* old sort internal, physical */
03362     static rc_t            _sort_file(
03363         const stid_t&           fid, 
03364         vid_t                   vid, 
03365         stid_t&                 sfid, 
03366         store_property_t        property,
03367         const key_info_t&       key_info, 
03368         int                     run_size,
03369         bool                    ascending,
03370         bool                    unique,
03371         bool                    destructive
03372     );
03373 #endif /* OLDSORT_COMPATIBILITY */
03374 
03375     /* new sort internal, physical */
03376     static rc_t            _sort_file(
03377         const stid_t&             fid,     // input file
03378         const stid_t&             sorted_fid, // output file -- 
03379                         // created by caller--
03380                         // can be same as input file
03381         int                      nvids,    // array size for vids
03382         const vid_t*             vid,     // array of vids for temp
03383         sort_keys_t&             kl,     // key location info &
03384         smsize_t                 min_rec_sz, // for estimating space use
03385         int                      run_size,   // # pages to use for a run
03386         int                      temp_space //# pages VM to use for scratch 
03387     );
03388 
03389 
03390 #ifdef OLDSORT_COMPATIBILITY
03391     /* internal compatibility old sort-> new sort */
03392     static rc_t            _new_sort_file(
03393             const stid_t&         in_fid, 
03394             const stid_t&         out_fid, 
03395             const key_info_t&    ki, 
03396             int                  run_size,
03397             bool                  ascending, 
03398             bool                  unique, 
03399             bool                  keep_orig //!destructive
03400             ); 
03401 #endif /* OLDSORT_COMPATIBILITY */
03402 
03403     static store_flag_t     _make_store_flag(store_property_t property);
03404     // reverse function:
03405     // static store_property_t    _make_store_property(w_base_t::uint4_t flag);
03406     // is in dir_vol_m
03407 
03408     // this is for df statistics  DU DF
03409     static rc_t            _get_du_statistics(
03410         vid_t                  vid, 
03411         sm_du_stats_t&         du,
03412         bool                   audit);
03413 
03414     static rc_t            _get_du_statistics(
03415         const stid_t  &        stid, 
03416         sm_du_stats_t&         du,
03417         bool                   audit);
03418 
03419     static rc_t            _get_volume_meta_stats(
03420         vid_t                  vid,
03421         SmVolumeMetaStats&     volume_stats,
03422         concurrency_t          cc);
03423 
03424     static rc_t            _get_file_meta_stats(
03425         vid_t                  vid,
03426         w_base_t::uint4_t      num_files,
03427         SmFileMetaStats*       file_stats,
03428         bool                   batch_calculate,
03429         concurrency_t          cc);
03430 };
03431 
03432 /**\brief Information about a store that can be queried by the client.
03433  * \details This information is stored in a store directory on the volume
03434  * and can be queried with ss_m::get_store_info.  The constructor allocates
03435  * keydescr (\a len bytes, contents unset); the destructor frees it.
03436  */
03437 class sm_store_info_t {
03438     // Not copyable (declared private, never defined): a member-wise copy
03439     // would leave two objects calling delete[] on the same keydescr buffer.
03440     NORET sm_store_info_t(const sm_store_info_t&);
03441     sm_store_info_t& operator=(const sm_store_info_t&);
03442 public:
03443     NORET sm_store_info_t(int len) : store(0), stype(ss_m::t_bad_store_t),
03444                 ntype(ss_m::t_bad_ndx_t), cc(ss_m::t_cc_bad), eff(0), large_store(0),
03445                 root(0), nkc(0), keydescrlen(len), keydescr(new char[len]) {}
03446     NORET ~sm_store_info_t() { delete[] keydescr; } // delete[] of null is a no-op
03447 
03448     /// store number
03449     snum_t    store;
03450     /// t_index, t_file, ... See ss_m::store_t.
03451     u_char    stype;
03452     /// t_btree, t_rtree,... See ss_m::ndx_t
03453     u_char    ntype;
03454     /// t_cc_kvl, t_cc_record,... See ss_m::concurrency_t
03455     u_char    cc;
03456 
03457     /// Unused:
03458     u_char    eff;
03459 
03460     /// Store number for associated large-page store, if there is one.
03461     snum_t    large_store;
03462     /// Root page if this is an index.
03463     shpid_t    root;
03464     /// Number of key components if this is an index.
03465     w_base_t::uint4_t    nkc;
03466     /// Size of key description (if this is an index)
03467     int        keydescrlen;
03468     /**\brief Variable length string.
03469      *
03470      * Whoever creates a sm_store_info_t for use with get_store_info()
03471      * is responsible for allocating enough space (the constructor's
03472      * \a len argument) for key descriptors if they are expected.
03473      * See \ref key_description.
03474      */
03475     char        *keydescr;
03476 };
03477 
03478 
03479 ostream& operator<<(ostream& o, const vid_t& v);    // one inserter (<<) / extractor (>>) pair per type:
03480 istream& operator>>(istream& i, vid_t& v);          // vid_t, extid_t, stid_t, lpid_t, shrid_t, rid_t
03481 ostream& operator<<(ostream& o, const extid_t& x);
03482 istream& operator>>(istream& i, extid_t& x);
03483 ostream& operator<<(ostream& o, const stid_t& stid);
03484 istream& operator>>(istream& i, stid_t& stid);
03485 ostream& operator<<(ostream& o, const lpid_t& pid);
03486 istream& operator>>(istream& i, lpid_t& pid);
03487 ostream& operator<<(ostream& o, const shrid_t& r);
03488 istream& operator>>(istream& i, shrid_t& r);
03489 ostream& operator<<(ostream& o, const rid_t& rid);
03490 istream& operator>>(istream& i, rid_t& rid);
03491 ostream& operator<<(ostream& o, const sm_stats_info_t& s);  // output only: no extractor is declared alongside it
03492 template<class Stream>
03493 Stream& operator<<(Stream& o, const sm_config_info_t& s)
03494 {
          // Writes every configuration field as "  <name> <value>", back to
          // back in the order listed; no newline is emitted.  The stream type
          // is a template parameter deduced from the first argument, and the
          // same stream object is handed back to allow further chaining.
03495     o << "  page_size "            << s.page_size
03496       << "  max_small_rec "        << s.max_small_rec
03497       << "  lg_rec_page_space "    << s.lg_rec_page_space
03498       << "  buffer_pool_size "     << s.buffer_pool_size
03499       << "  max_btree_entry_size " << s.max_btree_entry_size
03500       << "  exts_on_page "         << s.exts_on_page
03501       << "  pages_per_ext "        << s.pages_per_ext
03502       << "  logging "              << s.logging;
03503 
03504     return o;
03505 }
03506 
03507 
03508 #ifndef VEC_T_H
03509 #include <vec_t.h>
03510 #endif
03511 
03512 #ifndef SM_ESCALATION_H
03513 #include <sm_escalation.h>
03514 #endif
03515 
03516 /*<std-footer incl-file-exclusion='SM_H'>  -- do not edit anything below this line -- */
03517 
03518 #endif          /*</std-footer>*/

Generated on Thu Dec 9 08:42:27 2010 for Shore Storage Manager by doxygen 1.4.7