00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_H'> 00025 00026 $Id: sm.h,v 1.322 2010/10/27 17:04:23 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef SM_H 00054 #define SM_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 /* 00061 * Stuff needed by value-added servers. NOT meant to be included by 00062 * internal SM .c files, except to the extent that they need these 00063 * definitions used in the API. 00064 */ 00065 00066 #ifdef __GNUG__ 00067 #pragma interface 00068 #endif 00069 00070 #ifndef SM_INT_4_H 00071 #include <sm_int_4.h> 00072 #endif 00073 00074 #ifndef SM_DU_STATS_H 00075 #include <sm_du_stats.h> // declares sm_du_stats_t 00076 #endif 00077 00078 #ifndef SM_STATS_H 00079 #include <smstats.h> // declares sm_stats_info_t and sm_config_info_t 00080 #endif 00081 00082 #ifndef SM_S_H 00083 #include <sm_s.h> // declares key_type_s, rid_t, lsn_t 00084 #endif 00085 00086 #ifndef LEXIFY_H 00087 #include <lexify.h> // declares sortorder with constants 00088 #endif 00089 00090 #ifndef NBOX_H 00091 #include <nbox.h> // key_info_t contains nbox_t 00092 #endif /* NBOX_H */ 00093 00094 #ifndef SORT_S_H 00095 #include <sort_s.h> // declares key_info_t 00096 #endif 00097 00098 /* DOXYGEN Documentation : */ 00099 00100 /**\addtogroup LOGSPACE 00101 * 00102 * Updates performed by transactions are logged so that 00103 * they can be rolled back (in the event of a transaction abort) 00104 * or restored (in the event of a crash). Both the old and new values 00105 * of an updated location are logged.
This allows a steal, no-force 00106 * buffer management policy, which means the buffer manager is free 00107 * to write dirty pages to disk at any time and yet does not have 00108 * to write dirty pages for a transaction to commit. 00109 * 00110 * The log is stored in a set of Unix files, all in the same directory, 00111 * whose path is determined by a run-time option. 00112 * The maximum size of the log is also determined by a run-time option. 00113 * The proper value of the log size depends on 00114 * the expected transaction mix. More specifically, it depends on the 00115 * age of the oldest (longest running) transaction in the system and 00116 * the amount of log space used by all active transactions. Here are 00117 * some general rules to determine the amount of free log space 00118 * available in the system. 00119 * - Log records between the first log 00120 * record generated by the oldest active transaction and the most 00121 * recent log record generated by any transaction cannot be thrown 00122 * away. 00123 * - Log records from a transaction are no longer needed 00124 * once the transaction has committed or completely aborted and all 00125 * updates have made it to disk. Aborting a transaction causes log space 00126 * to be used, so space is reserved for aborting each transaction. 00127 * Enough log space must be available to commit or abort all active 00128 * transactions at all times. 00129 * 00130 * - Only space starting at the beginning of the log can be reused. 00131 * This space can be reused if it contains log records only for 00132 * transactions meeting the previous rule. 00133 * 00134 * - All storage manager calls that update records require log space twice 00135 * the size of the space updated in the record. All calls that create, 00136 * append, or truncate records require log space equal to the size 00137 * created, inserted, or deleted.
Log records generated by these calls 00138 * (generally one per call) have an overhead of approximately 50 bytes. 00139 * 00140 * - The amount of log space reserved for aborting a transaction is equal to 00141 * the amount of log space generated by the transaction plus a fudge 00142 * factor. 00143 * (Where btrees are concerned, a structure modification 00144 * might be necessary on abort, using more space on abort, or might not be 00145 * necessary on abort where it was done during forward processing, 00146 * using less space on abort.) 00147 * 00148 * - The transaction assumes responsibility for reserving space in the 00149 * log so that it can abort, should it need to (without leaving an 00150 * unrecoverable volume). The transaction and the log cooperate to 00151 * reserve space for the transaction's aborting. 00152 * 00153 * - When insufficient log space is available for a transaction, the 00154 * transaction is (may be, depending on the server) aborted. 00155 * The storage manager will return an error indication (out of log space) 00156 * if it is unable to insert a log record into the log due to 00157 * insufficient space. 00158 * 00159 * Checkpoints are taken periodically by the storage manager in order to 00160 * free log space and shorten recovery time. Checkpoints are "fuzzy" 00161 * and do not require the system to pause while they are completing. 00162 * 00163 * See the storage manager constructor ss_m::ss_m for more information 00164 * about handling out-of-logspace conditions. 00165 * 00166 */ 00167 00168 /**\addtogroup SSMOPT 00169 * 00170 * These are the run-time options for the storage manager. 00171 * 00172 * -sm_bufpoolsize : 00173 * - type: number 00174 * - description: This is the size of 00175 * the buffer pool in Kb. Must be large enough to hold at least 32 pages, 00176 * so it depends on the configured page size.
00177 * - default: none 00178 * - required?: yes 00179 * 00180 * -sm_hugetlbfs_path 00181 * - type: string (full absolute path name) 00182 * - description: Needed only if you configured --with-hugetlbfs. 00183 * - default: see \ref CONFIGOPT 00184 * - required?: no 00185 * 00186 * -sm_reformat_log 00187 * - type: Boolean 00188 * - description: If "yes", your log will be clobbered and the storage 00189 * manager will start up with an entirely new log. 00190 * - default: no 00191 * - required?: no 00192 * 00193 * -sm_logdir 00194 * - type: string (relative or absolute path name) 00195 * - description: Location of the log files. 00196 * - default: none 00197 * - required?: yes 00198 * 00199 * -sm_logbufsize 00200 * - type: number 00201 * - description: size of log buffer in KB. 00202 * Must be greater than or equal to the larger of 00203 * (4 times the page size, 64 Kb) 00204 * and less than or equal to 00205 * 128 times the page_size. This is the size of 00206 * the log buffer in Kb. 00207 * - default: 128 00208 * - required?: no 00209 * 00210 * -sm_logsize 00211 * - type: number 00212 * - description: greater than or equal to 8256. 00213 * This is the maximum size of the log in Kb. It is a function of 00214 * the log buffer size, and the default is the minimum allowable for 00215 * the default sm_logbufsize. 00216 * - default: 128 00217 * - required?: yes 00218 * 00219 * -sm_log_warn 00220 * - type: number between 0 and 100 (percentage) 00221 * - description: percentage of log that, when consumed by active 00222 * transactions, triggers a callback warning of potential inability 00223 * to roll back. Should be less than 50. 00224 * - default: 45 00225 * - required?: no 00226 * 00227 * -sm_errlog 00228 * - type: string (relative or absolute path name OR - ) 00229 * - description: Destination for error messages. If "-" is given, 00230 * the destination is stderr.
00231 * - default: \b - 00232 * - required?: no 00233 * 00234 * -sm_errlog_level 00235 * - type: string (one of none|emerg|fatal|internal|error|warning|info|debug) 00236 * - description: filter. Messages of this priority or higher are issued to 00237 * the error log; messages with lower priority are not issued. 00238 * The priorities are listed from high to low. "none" means no logging 00239 * will happen. 00240 * - default: error 00241 * - required?: no 00242 * 00243 * -sm_locktablesize : 00244 * - type: number greater than or equal to 64 00245 * - description: size of lock manager's hash table will be a prime 00246 * number near and greater than the given number. 00247 * - default: 64000 (yields a hash table with 65521 buckets) 00248 * - required?: no 00249 * 00250 * -sm_lock_escalate_to_page_threshold 00251 * - type: number greater than or equal to 0 00252 * - description: after acquiring this many record locks on a page, the lock 00253 * will be escalated to a page lock. A value of 0 disables escalation to a 00254 * page lock. 00255 * - default: 5 00256 * - required?: no 00257 * 00258 * -sm_lock_escalate_to_store_threshold 00259 * - type: number greater than or equal to 0 00260 * - description: after acquiring this many page locks in a store, 00261 * the lock will be escalated to a store lock. 00262 * A value of 0 disables escalation to a store lock. 00263 * - default: 25 00264 * - required?: no 00265 * 00266 * -sm_lock_escalate_to_volume_threshold 00267 * - type: number greater than or equal to 0 00268 * - description: after acquiring this many store locks in a volume, 00269 * the lock will be escalated to a volume lock. 00270 * A value of 0 disables escalation to a volume lock. 00271 * - default: 0 00272 * - required?: no 00273 * 00274 * -sm_cc_alg 00275 * - type: string (one of file | page | record | none) 00276 * - description: default locking granularity for file operations.
00277 * This can be overridden on a per-transaction basis with 00278 * ss_m::set_xct_lock_level(). 00279 * - default: record 00280 * - required?: no 00281 * 00282 * -sm_backgroundflush 00283 * - type: Boolean 00284 * - description: Enables background-flushing of volumes. 00285 * Must be set to "yes" for sm_num_page_writers to have any effect. 00286 * - default: yes 00287 * - required?: no 00288 * 00289 * -sm_num_page_writers 00290 * - type: number 00291 * - description: greater than or equal to 0; this is the number of 00292 * background-flushing threads for each volume. If you have 00293 * lots of threads, 00294 * a huge buffer pool, and few volumes, you should increase this. 00295 * If sm_backgroundflush is "no", this value is ignored. 00296 * - default: 2 00297 * - required?: no 00298 * 00299 * -sm_prefetch 00300 * - type: Boolean 00301 * - description: Enables prefetching for scans. 00302 * - default: no 00303 * - required?: no 00304 * 00305 * -sm_logging 00306 * - type: Boolean 00307 * - description: Allows you to turn off logging for a run of 00308 * the storage manager. This is only for experimentation, to 00309 * measure logging overhead in a limited way. 00310 * Aborts, rollbacks and restart/recovery 00311 * do not work without logging. Independent concurrent 00312 * transactions using btrees might not work without logging (this is 00313 * not well-tested). 00314 * Each time you start the server, you had better start with a 00315 * clean device or a device that resulted from a clean shutdown 00316 * of the prior run. 00317 * - default: yes 00318 * - required?: no 00319 * 00320 * -sm_lock_caching 00321 * - type: Boolean 00322 * - description: Enables caching of transaction locks in transaction. 00323 * Can be turned off for experimentation. If no, the default is not 00324 * to cache locks, but any transaction can turn on caching for itself 00325 * by calling the ss_m method set_lock_cache_enable(bool enable). 
00326 * - default: yes 00327 * - required?: no 00328 * 00329 */ 00330 00331 00332 /**\addtogroup SSMXCT 00333 * All storage manager operations on data must be done within the scope of 00334 * a transaction (ss_m::begin_xct, ss_m::commit_xct, ss_m::abort_xct, 00335 * ss_m::chain_xct). 00336 * 00337 * A very few storage manager operations, such as formatting a volume, are 00338 * called outside the scope of a transaction and the storage manager begins 00339 * its own transaction to do the work. 00340 * 00341 * Operations that fail return an error indication and the storage 00342 * manager assumes that the server will thereafter abort the 00343 * transaction in which the error occurred, when abort is indicated. 00344 * Abort is indicated when eUSERABORT or eDEADLOCK is returned and 00345 * when the server chooses to abort rather than to work around the problem 00346 * (whatever it might be, such as eRETRY). 00347 * 00348 * The storage manager does not enforce the aborting of any erroneous 00349 * transactions except, possibly, those that are in danger of 00350 * running out of log space. 00351 * (This is done with the destructor of the prologue used on each call 00352 * to the storage manager, see next paragraph). 00353 * 00354 * It is always the server's responsibility to abort. 00355 * When the storage manager 00356 * encounters an eLOGSPACEWARN condition (the log hasn't enough 00357 * space \e at \e this \e moment to abort the running transaction, 00358 * assuming a 1:1 ratio of rollback-logging overhead to forward-processing 00359 * logging overhead), it does one of two things: 00360 * - passes the error code eLOGSPACEWARN up the call stack back to the server 00361 * if the storage manager was constructed with no log-space-warning callback 00362 * argument (see LOG_WARN_CALLBACK_FUNC, ss_m::ss_m). 00363 * - tries to abort a transaction before passing an error code back up 00364 * the call stack to the server.
Choosing a victim transaction to abort 00365 * is done by the server in its log-space-warning callback function (passed 00366 * in on ss_m::ss_m, q.v.). 00367 * Only if that callback function returns a non-null victim transaction 00368 * and returns eUSERABORT does the storage manager abort that victim 00369 * before returning eUSERABORT up the call stack. Any other 00370 * error code returned by the callback function is just returned up 00371 * the call stack. 00372 * 00373 * \section LOCKS Locks 00374 * 00375 * The storage manager automatically acquires the 00376 * necessary locks when the data are read or written. 00377 * The locks thus acquired are normally released at the end of a transaction, 00378 * thus, by default, transactions are two-phase and well-formed (degree 3). 00379 * 00380 * \subsection GRAN Lock Granularity 00381 * The fine-grained locks are normally used for records in files, but 00382 * provision is made for using coarser-grained locks. The transaction 00383 * has a default lock level associated with it, 00384 * which governs the granularity of locks acquired by the storage manager 00385 * on behalf of the transaction. 00386 * The lock manager provides for lock escalation to coarser locks to 00387 * reduce the locking costs. See \ref SSMLOCK and smlevel_0::concurrency_t. 00388 * 00389 * Key-value locking is normally used for B+-Trees. (See \ref MOH1.) 00390 * R*-Trees normally use coarse-granularity locking. 00391 * The locking protocol used with an index is determined when the 00392 * index is created. A transaction may acquire coarse (index-level) 00393 * locks with explicit calls to the lock manager, but by default, 00394 * the granularity/level/protocol associated with the index is used. 00395 * See smlevel_0::concurrency_t. 00396 * 00397 * \section DISTXCT Distributed Transactions 00398 * Storage manager transactions may be used as "threads" (to 00399 * overload this term) of distributed transactions.
00400 * Coordination of 2-phase commit must be done externally, 00401 * but the storage manager supports preparing the (local) transaction "thread" 00402 * for two-phase commit, and it will log the necessary 00403 * data for recovering in-doubt transactions. 00404 * 00405 * \section ATTACH Threads and Transactions 00406 * Transactions are not tied to storage manager threads (smthread_t, not 00407 * to be confused with a local "thread" of a distributed transaction) in any 00408 * way other than that a transaction must be \e attached to a 00409 * thread while any storage manager work is being done on behalf of 00410 * that transaction. This is how the storage manager knows \e which 00411 * transaction is to acquire the locks and latches, etc. 00412 * But a thread can attach and detach from transactions at will, so 00413 * work may be performed by different threads each time the storage 00414 * manager is called on behalf of a given transaction; this allows the 00415 * server to keep a pool of threads to perform work and allows them to 00416 * perform work on behalf of any active transaction. 00417 * 00418 * \warning 00419 * While there are limited circumstances in which multiple threads can be 00420 * attached to the same transaction \e concurrently and perform storage 00421 * manager operations on behalf of that transaction concurrently, 00422 * which is a hold-over from the original storage manager, this 00423 * functionality will be deprecated soon. The reason for this being 00424 * removed is that it is extremely difficult to handle errors internally 00425 * when multiple threads are attached to a transaction because 00426 * partial rollback is impossible in the absence of multiple log streams 00427 * for a transaction. 00428 * 00429 * Under no circumstances may a thread attach to more than one transaction 00430 * at a time. 
00431 * 00432 * 00433 * \section EXOTICA Exotica 00434 * The storage manager also provides 00435 * - partial rollback (ss_m::save_work and ss_m::rollback_work), 00436 * which undoes actions but does not release locks, 00437 * - transaction chaining (ss_m::chain_xct), which commits, but retains locks 00438 * and gives them to a new transaction, 00439 * - lock release (sm_quark_t, ss_m::unlock), allowing less-than-3-degree 00440 * transactions. 00441 * 00442 * To reduce the cost (particularly in logging) of loading databases, 00443 * the storage manager provides for unlogged loading of stores. 00444 * See \ref SSMSTORE. 00445 */ 00446 00447 00448 00449 /** \file sm_vas.h 00450 * \details 00451 * This is the include file that all value-added servers should 00452 * include to get the Shore Storage Manager API. 00453 * 00454 */ 00455 /********************************************************************/ 00456 00457 /* Forward declarations only; no definitions are given here. */ class page_p; 00458 class xct_t; 00459 class device_m; 00460 class vec_t; 00461 class log_m; 00462 class lock_m; 00463 class btree_m; 00464 class file_m; 00465 class pool_m; 00466 class dir_m; 00467 class chkpt_m; 00468 class lid_m; 00469 class sm_stats_cache_t; 00470 class option_group_t; 00471 class option_t; 00472 class prologue_rc_t; 00473 class rtree_m; 00474 class sort_stream_i; 00475 00476 /**\addtogroup SSMSP 00477 * A transaction may perform a partial rollback using savepoints. 00478 * The transaction populates a savepoint by calling ss_m::save_work, 00479 * then it may roll back to that point with ss_m::rollback_work. 00480 * Locks acquired between the save_work and rollback_work are \e not 00481 * released. 00482 */ 00483 00484 /**\brief A point to which a transaction can roll back. 00485 * \ingroup SSMSP 00486 *\details 00487 * A transaction can do partial rollbacks with 00488 * save_work and rollback_work, which use this class to determine 00489 * how far to roll back.
00490 * It is nothing more than a log sequence number for the work done 00491 * to the point when save_work is called. 00492 */ 00493 class sm_save_point_t : public lsn_t { 00494 public: 00495 /* Default constructor: initializes _tid to (0,0). */ NORET sm_save_point_t(): _tid(0,0) {}; 00496 /* Stream output: writes _tid, a ':' separator, then the lsn_t base part. */ friend ostream& operator<<(ostream& o, const sm_save_point_t& p) { 00497 return o << p._tid << ':' << (const lsn_t&) p; 00498 } 00499 /* Stream input: reads _tid, one separator character into ch (its value is not checked), then the lsn_t base part. */ friend istream& operator>>(istream& i, sm_save_point_t& p) { 00500 char ch; 00501 return i >> p._tid >> ch >> (lsn_t&) p; 00502 } 00503 /* Returns a copy of the stored transaction id. */ tid_t tid() const { return _tid; } 00504 private: 00505 /* ss_m is a friend, so it may access _tid directly. */ friend class ss_m; 00506 tid_t _tid; 00507 }; 00508 00509 /**\addtogroup SSMQK 00510 * A quark is a marker in the transaction's list of acquired locks. 00511 * One may release all short-duration locks acquired since the quark was inserted 00512 * into the list via sm_quark_t::open(). 00513 * The lock manager modifies the locks acquired inside a quark 00514 * so that non-extent locks are no longer than short-duration. 00515 * 00516 * This is for experimentation only, and is \e not well-tested or supported. 00517 * 00518 * How used: 00519 * \code 00520 * sm_quark_t *q = new sm_quark_t; 00521 * q->open(); // inserts marker in transaction's list. 00522 * ... 00523 * q->close(); // frees short-duration locks to the marker. 00524 * delete q; 00525 * \endcode 00526 * 00527 * Deleting the quark without closing it causes it to be closed. 00528 * Quarks may \e not be used with multi-threaded transactions. 00529 * 00530 * Note that if a transaction has multiple threads attached when 00531 * a thread opens a quark, there is no way to determine where the 00532 * quark takes effect, and since it affects the locks acquired by 00533 * all threads of the transaction, it must be used very carefully 00534 * where multiply-threaded transactions are concerned. 00535 */ 00536 00537 /**\brief List of locks acquired by a transaction since 00538 * the quark was "opened".
00539 * \ingroup SSMQK 00540 * \details 00541 * When a quark is closed (by calling close()), 00542 * the release_locks parameter indicates if all short-duration read 00543 * locks acquired during the quark should be released. 00544 * \note Quarks are an experimental feature for use 00545 * as a building block for a more general nested-transaction facility. 00546 * 00547 * \internal See lock_x.h 00548 */ 00549 class sm_quark_t { 00550 public: 00551 /* Constructor with an empty body; _tid is not explicitly initialized here. */ NORET sm_quark_t() {} 00552 NORET ~sm_quark_t(); 00553 00554 rc_t open(); 00555 /* release defaults to true. NOTE(review): the class comment calls this parameter release_locks; the declared name is release. */ rc_t close(bool release=true); 00556 00557 /* Returns a copy of the stored _tid. */ tid_t tid()const { return _tid; } 00558 /* Evaluates to true when _tid differs from tid_t::null. */ operator bool()const { return (_tid != tid_t::null); } 00559 friend ostream& operator<<(ostream& o, const sm_quark_t& q); 00560 friend istream& operator>>(istream& i, sm_quark_t& q); 00561 00562 private: 00563 friend class ss_m; 00564 tid_t _tid; 00565 00566 /* The copy constructor and copy-assignment operator that follow are declared in the private section. */ // disable 00567 sm_quark_t(const sm_quark_t&); 00568 sm_quark_t& operator=(const sm_quark_t&); 00569 00570 }; 00571 00572 class sm_store_info_t; 00573 class log_entry; 00574 class coordinator; 00575 class tape_t; 00576 /**\brief \b This \b is \b the \b SHORE \b Storage \b Manager \b API. 00577 *\details 00578 * Most of the API for using the storage manager is through this 00579 * interface class. 00580 */ 00581 class ss_m : public smlevel_top 00582 { 00583 friend class pin_i; 00584 friend class sort_stream_i; 00585 friend class prologue_rc_t; 00586 friend class log_entry; 00587 friend class coordinator; 00588 friend class tape_t; 00589 public: 00590 00591 typedef smlevel_0::LOG_WARN_CALLBACK_FUNC LOG_WARN_CALLBACK_FUNC; 00592 typedef smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC LOG_ARCHIVED_CALLBACK_FUNC; 00593 typedef smlevel_0::ndx_t ndx_t; 00594 typedef smlevel_0::concurrency_t concurrency_t; 00595 typedef smlevel_1::xct_state_t xct_state_t; 00596 00597 typedef sm_store_property_t store_property_t; 00598 00599 #ifdef COMMENT 00600 // 00601 // Below is most of the interface for the SHORE Storage Manager.
00602 // The rest is located in pin.h, scan.h, and smthread.h 00603 // 00604 00605 // 00606 // TEMPORARY FILES/INDEXES 00607 // 00608 // When a file or index is created there is a tmp_flag parameter 00609 // that when true indicates that the file is temporary. 00610 // Operations on a temporary file are not logged and the 00611 // file will be gone the next time the volume is mounted. 00612 // 00613 // TODO: IMPLEMENTATION NOTE on Temporary Files/Indexes: 00614 // Temp files cannot be trusted after transaction abort. 00615 // They should be marked for removal. 00616 // 00617 // CODE STRUCTURE: 00618 // Almost all ss_m functions begin by creating a prologue object 00619 // whose constructor and destructor check for many common errors. 00620 // In addition most ss_m::OP() functions now call an ss_m::_OP() 00621 // function to do the real work. The ss_m::OP functions should 00622 // not be called by other ss_m functions, instead the corresponding 00623 // ss_m::_OP function should be used. 00624 // 00625 00626 #endif /* COMMENT */ 00627 00628 public: 00629 /**\brief Add storage manager options to the given options group. 00630 *\ingroup SSMINIT 00631 *\details 00632 * @param[in] grp The caller's option group, to which the 00633 * storage manager's options will be added for processing soon. 00634 * 00635 * Before the ss_m constructor can be called, setup_options 00636 * \b must be called. This will install the storage manager's options and 00637 * initialize any that are not required. 00638 * Once all required options have been set, an ss_m can be constructed. 00639 * 00640 *\note This is not thread-safe. The application (server) must prevent 00641 * concurrent calls to setup_options. 00642 */ 00643 static rc_t setup_options(option_group_t* grp); 00644 00645 /**\brief Initialize the storage manager. 00646 * \ingroup SSMINIT 00647 * \details 00648 * @param[in] warn A callback function. This is called 00649 * when/if the log is in danger of becoming "too full".
00650 * @param[in] get A callback function. This is called 00651 * when the storage manager needs an archived log file to be restored. 00652 * 00653 * When an ss_m object is created, the storage manager initializes itself 00654 * and, 00655 * if the sthreads package has not already been initialized by virtue 00656 * of an sthread_t running, the sthreads package is initialized now. 00657 * 00658 * The log is read and recovery is performed (\ref MHLPS), 00659 * and control returns to 00660 * the caller, after which time 00661 * storage manager threads (instances of smthread_t) may be constructed and 00662 * storage manager may be used. 00663 * 00664 * The storage manager is used by invoking its static methods. 00665 * You may use them as follows: 00666 * \code 00667 * ss_m *UNIQ = new ss_m(); 00668 * 00669 * W_DO(UNIQ->mount_dev(...)) 00670 * // or 00671 * W_DO(ss_m::mount_dev(...)) 00672 * \endcode 00673 * ). 00674 * 00675 * Only one ss_m object may be extant at any time. If you try 00676 * to create another while the one exists, a fatal error will occur 00677 * (your program will choke with a message about your mistake). 00678 * 00679 * The callback argument given to the storage manager constructor 00680 * is called when the storage manager determines that it is in danger 00681 * of running out of log space. Heuristics are used to guess when 00682 * this is the case. 00683 * 00684 * If the function \a warn archives and removes log files, the function 00685 * \a get must be provided to restore those log files when the 00686 * storage manager needs them. 00687 * 00688 * For details and examples, see \ref smlevel_0::LOG_WARN_CALLBACK_FUNC, 00689 * \ref smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC, and 00690 * \ref LOGSPACE. 00691 */ 00692 ss_m(LOG_WARN_CALLBACK_FUNC warn=NULL, LOG_ARCHIVED_CALLBACK_FUNC get=NULL); 00693 00694 /**\brief Shut down the storage manager. 00695 * \ingroup SSMINIT 00696 * \details 00697 * When the storage manager object is deleted, it shuts down. 
00698 * Thereafter it is not usable until another ss_m object is 00699 * constructed. 00700 */ 00701 ~ss_m(); 00702 00703 /**\brief Cause the storage manager's shutting down to be done cleanly 00704 * or to simulate a crash. 00705 * \ingroup SSMINIT 00706 * \details 00707 * @param[in] clean True means shut down gracefully, false means simulate a crash. 00708 * 00709 * When the storage manager's destructor is called 00710 * the buffer pool is flushed to disk, unless this method is called 00711 * with \a clean == \e false. 00712 * 00713 * \note If this method is used, it 00714 * must be called after the storage manager is 00715 * constructed if it is to take effect. Each time the storage 00716 * manager is constructed, the state associated with this is set 00717 * to \e true, i.e., "shut down properly". 00718 * 00719 * \note This method is not thread-safe, only one thread should use this 00720 * at any time, presumably just before shutting down. 00721 */ 00722 static void set_shutdown_flag(bool clean); 00723 00724 /**\brief Notify storage manager when a log file was archived by a 00725 * LOG_WARN_CALLBACK_FUNC. 00726 * 00727 * The arguments: 00728 * @param[in] logfile Character string name of file archived. 00729 */ 00730 static rc_t log_file_was_archived(const char * logfile); 00731 00732 private: 00733 void _construct_once(LOG_WARN_CALLBACK_FUNC x=NULL, 00734 LOG_ARCHIVED_CALLBACK_FUNC y=NULL); 00735 void _destruct_once(); 00736 00737 00738 public: 00739 /**\addtogroup SSMXCT 00740 * 00741 * All work performed on behalf of a transaction must occur while that 00742 * transaction is "attached" to the thread that performs the work. 00743 * Creating a transaction attaches it to the thread that creates the transaction. 00744 * The thread may detach from the transaction and attach to another. 00745 * Multiple threads may attach to a single transaction and do work in certain circumstances.
See \ref SSMMULTIXCT 00746 * 00747 * 00748 */ 00749 /**\brief Begin a transaction 00750 *\ingroup SSMXCT 00751 * @param[in] timeout Optional, controls blocking behavior. 00752 * \details 00753 * 00754 * Start a new transaction and "attach" it to this thread. 00755 * No running transaction may be attached to this thread. 00756 * 00757 * Storage manager methods that must block (e.g., to acquire a lock) 00758 * will use the timeout given. 00759 * The default timeout is the one associated with this thread. 00760 * 00761 * \sa timeout_in_ms 00762 */ 00763 static rc_t begin_xct( 00764 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00765 00766 /**\brief Begin an instrumented transaction. 00767 *\ingroup SSMXCT 00768 * @param[in] stats Pointer to an allocated statistics-holding structure. 00769 * @param[in] timeout Optional, controls blocking behavior. 00770 * \details 00771 * No running transaction may be already attached to this thread. 00772 * A new transaction is started and attached to the running thread. 00773 * 00774 * The transaction will be instrumented. 00775 * This structure is updated by the storage manager whenever a thread 00776 * detaches from this transaction. The activity recorded during 00777 * the time the thread is attached to the transaction will be stored in 00778 * the per-transaction statistics. 00779 * \attention It is the client's 00780 * responsibility to delete the statistics-holding structure. 00781 * 00782 * Storage manager methods that must block (e.g., to acquire a lock) 00783 * will use the timeout given. 00784 * The default timeout is the one associated with this thread. 00785 * 00786 * \sa timeout_in_ms 00787 */ 00788 static rc_t begin_xct( 00789 sm_stats_info_t* stats, // allocated by caller 00790 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00791 00792 /**\brief Begin a transaction and return the transaction id. 00793 *\ingroup SSMXCT 00794 * @param[out] tid Transaction id of new transaction.
00795 * @param[in] timeout Optional, controls blocking behavior. 00796 * \details 00797 * 00798 * No running transaction may be attached to this thread. 00799 * 00800 * Storage manager methods that must block (e.g., to acquire a lock) 00801 * will use the timeout given. 00802 * The default timeout is the one associated with this thread. 00803 * 00804 * \sa timeout_in_ms 00805 */ 00806 static rc_t begin_xct( 00807 tid_t& tid, 00808 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00809 00810 /**\addtogroup SSM2PC 00811 * The storage manager contains support for externally-coordinated 00812 * transactions that use 00813 * two-phase-commit with presumed abort. 00814 * The server must provide the coordination and the coordinator is 00815 * assumed to have its own stable storage, and it is assumed to recover 00816 * from failures in a "short time", the precise meaning of which is given below. 00817 * A prepared transaction, like an active transaction, 00818 * consumes log space and holds locks. 00819 * Even if a prepared transaction does not hold locks needed by 00820 * other transactions, it consumes resources in a way that can interfere 00821 * with other transactions. 00822 * If a prepared transaction remains in the system for a long time 00823 * while other transactions are running, eventually the storage 00824 * manager needs the log space used (reserved) by the prepared transaction. 00825 * A coordinator must resolve its prepared transactions 00826 * before the storage manager effectively runs out of 00827 * log space for other transactions in the system. 00828 * The amount of time involved is a function of the size of the log 00829 * and of the demands of the other transactions in the system. 00830 * 00831 * For the purpose of this discussion, the portion of a global 00832 * transaction that involves a single Shore Storage Manager transaction is 00833 * called a thread of the global transaction. 
00834 * 00835 * A Shore transaction participates as a thread of a global transaction 00836 * as follows: 00837 - Start a storage-manager transaction with ss_m::begin_xct. 00838 - Acquire a global transaction identifier from the coordinator. 00839 - Indicate to the storage manager that this transaction is a 00840 thread of a global transaction, and associate the global transaction 00841 identifier with this thread by calling ss_m::enter_2pc. 00842 - Associate a coordinator with the transaction for recovery 00843 purposes, by calling ss_m::set_coordinator. 00844 - Prepare the thread of the transaction and get the storage manager's 00845 vote with ss_m::prepare_xct. 00846 It is an error to commit a global transaction thread without first 00847 preparing it. It is an error to do anything else 00848 in a transaction after it is prepared, except to end 00849 the transaction or retry the prepare (to get the vote again). 00850 - Convey the vote to the coordinator, and determine the transaction's 00851 fate from the coordinator. 00852 - End the thread with ss_m::commit_xct or ss_m::abort_xct. 00853 * 00854 * The storage manager 00855 * logs the minimal information required to effect a vote of the 00856 * transaction threads that are storage manager transactions, 00857 * and to recover such in-doubt transactions after restart. 00858 * Thus, after a crash/restart, the server may query the storage manager 00859 * about in-doubt (prepared) transactions with ss_m::query_prepared_xct, 00860 * which tells the caller the number and global transaction IDs associated 00861 * with prepared transactions. 00862 * Using this, the server contacts the coordinator and resumes the 00863 * voting. 00864 * The server may find the local transaction IDs and use ss_m::tid_to_xct 00865 * to attach these transactions and to resolve them. 00866 * 00867 * Commit and abort of read-only transactions are the same, 00868 * as these transactions have no log entries. 
Preparing read-only transactions 00869 * causes them to commit/abort and the vote returned is vote_readonly. 00870 * Once this vote is communicated to the coordinator and the coordinator 00871 * records it on stable storage, there is no need to involve this thread in 00872 * any further processing. For this reason, 00873 * read-only transactions do not appear as prepared transactions at 00874 * recovery time. 00875 * 00876 */ 00877 00878 /**\brief Make the attached transaction a thread of a distributed transaction. 00879 *\ingroup SSM2PC 00880 * 00881 * @param[in] gtid Global transaction ID to associate with this transaction. This will be logged when the transaction is prepared. 00882 * 00883 * \note This can be called at most once for a given transaction. 00884 * The transaction must be attached to the calling thread. 00885 * No other threads may be attached to the transaction. 00886 */ 00887 static rc_t enter_2pc(const gtid_t &gtid); 00888 /**\brief Assign a coordinator handle to this distributed transaction. 00889 *\ingroup SSM2PC 00890 * @param[in] h Handle of the coordinator. Not interpreted by 00891 * the storage manager. 00892 * 00893 * The storage manager associates this server handle with the transaction 00894 * so that when the transaction is prepared, this information is 00895 * written to the log. Upon recovery, if this transaction is still in doubt, 00896 * the value-added server can query the 00897 * storage manager for in-doubt transactions, get their server handles, 00898 * and resolve the transactions. 00899 * See query_prepared_xct and recover_2pc. 00900 */ 00901 static rc_t set_coordinator(const server_handle_t &h); 00902 00903 /**\brief Prepare a thread of a distributed transaction. 00904 *\ingroup SSM2PC 00905 * @param[in] stats Pointer to an allocated statistics-holding 00906 * structure. 00907 * @param[out] vote This thread's vote.
00908 * 00909 * The storage manager will prepare the attached transaction (a thread 00910 * of a distributed transaction) for commit. 00911 * If this transaction has performed no logged updates, the 00912 * vote returned will be vote_readonly. 00913 * If this transaction can commit, the vote returned will be vote_commit. 00914 * If an error occurs during the prepare, the vote will be vote_abort. 00915 * 00916 * If the transaction is being instrumented, the 00917 * statistics-holding structure will be returned to the caller, 00918 * and the caller is responsible for its deallocation. 00919 */ 00920 static rc_t prepare_xct( 00921 sm_stats_info_t*& stats, 00922 vote_t& vote); 00923 00924 /**\brief Prepare a thread of a distributed transaction. 00925 *\ingroup SSM2PC 00926 * @param[out] vote This thread's vote. See \ref w_base_t::vote_t. 00927 * 00928 * The storage manager will prepare the attached transaction (a thread 00929 * of a distributed transaction) for commit. 00930 * If this transaction has performed no logged updates, the 00931 * vote returned will be vote_readonly. 00932 * If this transaction can commit, the vote returned will be vote_commit. 00933 * If an error occurs during the prepare, the vote will be vote_abort. 00934 */ 00935 static rc_t prepare_xct(vote_t &vote); 00936 00937 /**\brief Force the transaction to vote "read-only" in a two-phase commit. 00938 *\ingroup SSM2PC 00939 * \details 00940 * This will override the storage manager's determination of 00941 * whether this thread of a distributed transaction is read-only, which is 00942 * based on whether the local transaction thread logged anything. This 00943 * method may be useful if the local transaction rolled back to 00944 * a savepoint. 00945 * See \ref w_base_t::vote_t. 00946 */ 00947 static rc_t force_vote_readonly(); 00948 00949 /**\brief Given a global transaction id, find the local prepared 00950 * transaction associated with it. 
00951 *\ingroup SSM2PC 00952 * @param[in] gtid A global transaction ID (an opaque quantity 00953 * to the storage manager). 00954 * @param[in] mayblock Not used. 00955 * @param[out] local Return the transaction ID of the prepared 00956 * SM transaction. 00957 * \details 00958 * Searches the transaction list for a prepared transaction with the given 00959 * global transaction id. If found, it returns a reference to the 00960 * local transaction. The transaction is attached to the running 00961 * thread before it is returned. 00962 */ 00963 static rc_t recover_2pc(const gtid_t & gtid, 00964 bool mayblock, 00965 tid_t & local 00966 ); 00967 00968 /**\brief Return the number of prepared transactions. 00969 *\ingroup SSM2PC 00970 * @param[out] numtids The number of in-doubt transactions. 00971 * \details 00972 * Used by a server at start-up, after recovery, to find out if 00973 * there are any in-doubt transactions. If so, the server must 00974 * use the second form of query_prepared_xct to find the global 00975 * transaction IDs of these in-doubt transactions. 00976 */ 00977 static rc_t query_prepared_xct(int &numtids); 00978 00979 /**\brief Return the global transaction IDs of in-doubt transactions. 00980 *\ingroup SSM2PC 00981 * @param[in] numtids The number of global transaction ids in the list. 00982 * @param[in] l The caller-provided list into which to write the 00983 * global transaction-ids. 00984 * \details 00985 * Used by a server at start-up, after recovery, to find out the 00986 * global transaction IDs of the prepared transactions. The storage 00987 * manager fills in the first numtids entries of the pre-allocated list. 00988 * The server may have first called the first form of query_prepared_xct 00989 * to find out how many such transactions there are after recovery. 00990 * 00991 * \attention Read-only transactions 00992 * do not appear as in-doubt transactions. 
Because they did not 00993 * generate any log records, they will not be "discovered" by analysis. 00994 * The server must determine that any thread of a global transaction that 00995 * does not appear to be in doubt was a read-only thread or 00996 * it never prepared and thus has been aborted. 00997 * Read-only transactions that were prepared would have voted read-only, 00998 * and if the coordinator recorded that vote on stable storage, it 00999 * should not be concerned with these transaction threads any further. 01000 * If the coordinator does not have this information recorded, the 01001 * transaction thread could have been an aborted non-read-only transaction, 01002 * so the coordinator must, in this case, presume that the thread aborted 01003 * and thus make the global transaction abort. 01004 */ 01005 static rc_t query_prepared_xct(int numtids, gtid_t l[]); 01006 01007 01008 /**\brief Commit a transaction. 01009 *\ingroup SSMXCT 01010 * @param[in] lazy Optional, controls flushing of log. 01011 * @param[out] plastlsn If non-null, this is a pointer to a 01012 * log sequence number into which the storage 01013 * manager writes that of the last log record 01014 * inserted for this transaction. 01015 * \details 01016 * 01017 * Commit the attached transaction and detach it, destroy it. 01018 * If \a lazy is true, the log is not synced. This means that 01019 * recovery of this transaction might not be possible. 01020 */ 01021 static rc_t commit_xct( 01022 bool lazy = false, 01023 lsn_t* plastlsn=NULL); 01024 01025 /**\brief Commit an instrumented transaction and get its statistics. 01026 *\ingroup SSMXCT 01027 * @param[out] stats Get a copy of the statistics for this transaction. 01028 * @param[in] lazy Optional, controls flushing of log. 01029 * @param[out] plastlsn If non-null, this is a pointer to a 01030 * log sequence number into which the storage 01031 * manager writes that of the last log record 01032 * inserted for this transaction.
01033 * \details 01034 * 01035 * Commit the attached transaction and detach it, destroy it. 01036 * If \a lazy is true, the log is not synced. This means that 01037 * recovery of this transaction might not be possible. 01038 */ 01039 static rc_t commit_xct( 01040 sm_stats_info_t*& stats, 01041 bool lazy = false, 01042 lsn_t* plastlsn=NULL); 01043 01044 /**\brief Commit an instrumented transaction and start a new one. 01045 *\ingroup SSMXCT 01046 * @param[out] stats Get a copy of the statistics for the first transaction. 01047 * @param[in] lazy Optional, controls flushing of log. 01048 * \details 01049 * 01050 * Commit the attached transaction and detach it, destroy it. 01051 * Start a new transaction and attach it to this thread. 01052 * \note \e The \e new 01053 * \e transaction \e inherits \e the \e locks \e of \e the \e old 01054 * \e transaction. 01055 * 01056 * If \a lazy is true, the log is not synced. This means that 01057 * recovery of this transaction might not be possible. 01058 */ 01059 static rc_t chain_xct( 01060 sm_stats_info_t*& stats, /* in w/new, out w/old */ 01061 bool lazy = false); 01062 01063 /**\brief Commit a transaction and start a new one, inheriting locks. 01064 *\ingroup SSMXCT 01065 * @param[in] lazy Optional, controls flushing of log. 01066 * \details 01067 * 01068 * Commit the attached transaction and detach it, destroy it. 01069 * Start a new transaction and attach it to this thread. 01070 * \note \e The \e new 01071 * \e transaction \e inherits \e the \e locks \e of \e the \e old 01072 * \e transaction. 01073 * 01074 * If \a lazy is true, the log is not synced. This means that 01075 * recovery of the committed transaction might not be possible. 01076 */ 01077 static rc_t chain_xct(bool lazy = false); 01078 01079 01080 /**\brief Commit a group of transactions. 01081 *\ingroup SSMXCT 01082 * @param[in] list List of pointers to transactions to commit. 01083 * @param[in] listlen Number of transactions in the list. 
01084 * \details 01085 * 01086 * Commit each transaction in the list as an all-or-none affair. 01087 * Any transaction that is attached to the thread will be 01088 * detached before anything is done. 01089 * 01090 * The purpose of this method is to allow multiple transactions 01091 * to commit together with a single log record. No voting takes place. 01092 * The entire list of transaction identifiers must fit in a single 01093 * log record. If it does not, a descriptive error will be returned and no 01094 * transaction will be committed. In this case, the server has the 01095 * option to singly commit each transaction. 01096 * 01097 * If any other error occurs during one of the commits, the error 01098 * will be returned to the caller and none of the transactions 01099 * will be committed; they \b must be aborted thereafter. 01100 * 01101 * This is not intended to be used with transactions that are 01102 * participating in two-phase commit, but if 01103 * one of the transactions is participating in two-phase commit, 01104 * they all must be and they all must be prepared. 01105 * 01106 * Chaining and lazy commit are not offered with this form of commit. 01107 * If a transaction in the list is instrumented, its statistics 01108 * resources will be deleted upon successful commit. 01109 * 01110 * \note 01111 * By taking a list of transaction pointers, this avoids the tid_to_xct lookup 01112 * for each transaction, but the server must regard the transaction pointers as 01113 * invalid after this method returns. 01114 * The transactions, once committed, do not exist anymore. 01115 * If an error is returned, the server has to re-verify the transaction pointers 01116 * by using ss_m::tid_to_xct from a separate list of transaction ids to determine 01117 * which transactions are extant. 01118 */ 01119 static rc_t commit_xct_group( 01120 xct_t * list[], 01121 int listlen); 01122 01123 /**\brief Abort an instrumented transaction and get its statistics.
01124 *\ingroup SSMXCT 01125 * @param[out] stats Get a copy of the statistics for this transaction. 01126 * \details 01127 * 01128 * Abort the attached transaction and detach it, destroy it. 01129 */ 01130 static rc_t abort_xct(sm_stats_info_t*& stats); 01131 /**\brief Abort a transaction. 01132 *\ingroup SSMXCT 01133 * \details 01134 * 01135 * Abort the attached transaction and detach it, destroy it. 01136 */ 01137 static rc_t abort_xct(); 01138 01139 /**\brief Populate a save point. 01140 *\ingroup SSMSP 01141 * @param[out] sp An sm_save_point_t owned by the caller. 01142 *\details 01143 * Store in sp the needed information to be able to roll back 01144 * to this point. 01145 * For use with rollback_work. 01146 * \note Only one thread may be attached to a transaction when this 01147 * is called. 01148 */ 01149 static rc_t save_work(sm_save_point_t& sp); 01150 01151 /**\brief Roll back to a savepoint. 01152 *\ingroup SSMSP 01153 * @param[in] sp An sm_save_point_t owned by the caller and 01154 * populated by save_work. 01155 *\details 01156 * Undo everything that was 01157 * done from the time save_work was called on this savepoint. 01158 * \note Locks are not freed. 01159 * 01160 * \note Only one thread may be attached to a transaction when this 01161 * is called. 01162 */ 01163 static rc_t rollback_work(const sm_save_point_t& sp); 01164 01165 /**\brief Return the number of transactions in active state. 01166 *\ingroup SSMXCT 01167 * \details 01168 * While this is thread-safe, the moment a value is returned, it could 01169 * be out of date. 01170 * Useful only for debugging. 01171 */ 01172 static w_base_t::uint4_t num_active_xcts(); 01173 01174 /**\brief Attach the given transaction to the currently-running smthread_t. 01175 *\ingroup SSMXCT 01176 * \details 01177 * It is assumed that the currently running thread is an smthread_t. 
01178 */ 01179 static void attach_xct(xct_t *x) { me()->attach_xct(x); } 01180 01181 /**\addtogroup SSMMULTIXCT 01182 * 01183 * Certain operations may be performed while more than one 01184 * thread is attached to a transaction (this functionality is 01185 * soon to be deprecated). 01186 * Any number of attached threads may be read-only. 01187 * The kinds of updates that can be made by multiple threads are limited by 01188 * the need to avoid latch-mutex and latch-latch deadlocks. 01189 * 01190 * There are several reasons for this. 01191 * 1) The multiple threads are not protected from each other by locks. 01192 * 2) Interleaving of top-level actions is not supported with rollback; 01193 * this means that for the duration of a top-level action, a thread needs 01194 * access to the log that excludes all other threads in 01195 * the same transaction. 01196 * 01197 * The internal logging protocol is this: 01198 * T1: latch page, log update. Logging requires acquiring a mutex 01199 * on the xct's log buffer. 01200 * T2: performing any top-level action, acquires the mutex on the 01201 * xct's log buffer before doing the action (latching the page). 01202 * 01203 * Thus, anything involving top-level actions is suspect. B-trees 01204 * use top-level actions, as does file-page allocation, and creation/ 01205 * destruction of stores (files, indexes). Thus, just about 01206 * any kind of concurrent updates on the same page 01207 * in the same transaction is problematic, and just about any update 01208 * can result in latching extent-map or store-map pages. 01209 * This activity could be disallowed by enforcing a strict 01210 * rule that at most one update operation can be going on 01211 * in a transaction at any time, however this is too restrictive. 01212 * 01213 * Multiple updating threads can 01214 * work \b if \b the \b data \b are \b partitioned by volume. 
01215 * So a well-behaved server may use multiple-threaded transactions 01216 * to do updates as long as the updates are on different \b volumes. 01217 * It might also allow read-only transaction threads to be 01218 * concurrent with a single updating thread. 01219 * 01220 * Savepoints and partial rollback may \e not be used with 01221 * multi-threaded transactions. This is not enforced by the storage 01222 * manager; it is poor behavior on the part of a server. 01223 * For example, the behavior of the following is undefined: 01224 * - thread 1: attach, read, read, read, ... 01225 * - thread 2: attach, save work, update, rollback 01226 * If the two threads are reading and possibly updating the same 01227 * data, the results are timing-dependent and could produce a latch- 01228 * latch or latch-mutex deadlock. 01229 * 01230 * Ongoing research at DIAS is investigating ways to extend the usefulness 01231 * of parallelism within a transaction (multi-threaded transactions). 01232 * Current thoughts about this are for servers to coordinate multiple 01233 * transactions using two-phase commit or an optimized version 01234 * of commit and abort for groups of local transactions. 01235 */ 01236 01237 /**\brief Detach any attached transaction from the currently-running smthread_t. 01238 *\ingroup SSMXCT 01239 * \details 01240 * Sever the connection between the running thread and the transaction. 01241 * This allows the running thread to attach a different 01242 * transaction and to perform work on its behalf. 01243 */ 01244 static void detach_xct() { xct_t *x = me()->xct(); 01245 if(x) me()->detach_xct(x); } 01246 01247 /**\brief Get the transaction structure for a given transaction id. 01248 *\ingroup SSMXCT 01249 * @param[in] tid Transaction ID. 01250 *\details 01251 * Return a pointer to the storage manager's transaction structure. 01252 * Can be used with detach_xct and attach_xct.
01253 */ 01254 static xct_t* tid_to_xct(const tid_t& tid); 01255 /**\brief Get the transaction ID for a given transaction structure. 01256 *\ingroup SSMXCT 01257 * @param[in] x Pointer to transaction structure. 01258 *\details 01259 * Return the transaction ID for the given transaction. 01260 */ 01261 static tid_t xct_to_tid(const xct_t* x); 01262 01263 /**\brief Print transaction information to an output stream. 01264 *\ingroup SSMAPIDEBUG 01265 * @param[in] o Stream to which to write the information. 01266 * \details 01267 * This is for debugging only, and is not thread-safe. 01268 */ 01269 static rc_t dump_xcts(ostream &o); 01270 01271 /**\brief Get the transaction state for a given transaction (structure). 01272 *\ingroup SSMXCT 01273 * @param[in] x Pointer to transaction structure. 01274 * \details 01275 * Returns the state of the transaction (active, prepared). It is 01276 * hard to get the state of an aborted or committed transaction, since 01277 * their structures no longer exist. 01278 */ 01279 static xct_state_t state_xct(const xct_t* x); 01280 01281 /**\brief Return the amount of log this transaction would consume 01282 * if it rolled back. 01283 *\ingroup SSMXCT 01284 * 01285 * If a transaction aborts with eOUTOFLOGSPACE this function can 01286 * be used in conjunction with xct_reserve_log_space to 01287 * pre-allocate the needed amount of log space before retrying. 01288 */ 01289 static smlevel_0::fileoff_t xct_log_space_needed(); 01290 01291 /**\brief Require the specified amount of log space to be 01292 * available for this transaction before continuing. 01293 *\ingroup SSMXCT 01294 * 01295 * If a transaction risks running out of log space it can 01296 * pre-request some or all of the needed amount before starting in 01297 * order to improve its chances of success.
Other new transactions 01298 * will be unable to acquire log space before this request is 01299 * granted (existing ones will be able to commit, unless they also 01300 * run out of space, because that tends to free up log space and 01301 * avoids wasting work). 01302 */ 01303 static rc_t xct_reserve_log_space(fileoff_t amt); 01304 01305 /**\brief Get the locking granularity for the attached transaction. 01306 * \ingroup SSMLOCK 01307 */ 01308 static concurrency_t xct_lock_level(); 01309 /**\brief Set the default locking level for the attached transaction. 01310 * \ingroup SSMLOCK 01311 * \details 01312 * @param[in] l The level to use for the balance of this transaction. 01313 * Legitimate values are t_cc_record, t_cc_page, t_cc_file. 01314 * 01315 * \note Only one thread may be attached to the transaction when this 01316 * is called. If more than one thread is attached, a fatal error 01317 * will ensue. 01318 */ 01319 static void set_xct_lock_level(concurrency_t l); 01320 01321 /**\brief Collect transaction information in a virtual table. 01322 * \ingroup SSMVTABLE 01323 * \details 01324 * @param[out] v The virtual table to populate. 01325 * @param[in] names_too If true, make the 01326 * first row of the table a list of the attribute names. 01327 * 01328 * All attribute values will be strings. 01329 * The virtual table v can be printed with its output operator 01330 * operator<< for ostreams. 01331 * 01332 * \attention Not atomic. Can yield stale data. 01333 */ 01334 static rc_t xct_collect(vtable_t&v, bool names_too=true); 01335 01336 /**\brief Collect buffer pool information in a virtual table. 01337 * \ingroup SSMVTABLE 01338 * \details 01339 * @param[out] v The virtual table to populate. 01340 * @param[in] names_too If true, make the 01341 * first row of the table a list of the attribute names. 01342 * 01343 * \attention Be wary of using this with a large buffer pool. 01344 * 01345 * All attribute values will be strings. 
01346 * The virtual table v can be printed with its output operator 01347 * operator<< for ostreams. 01348 * 01349 * \attention Not atomic. Can yield stale data. 01350 */ 01351 static rc_t bp_collect(vtable_t&v, bool names_too=true); 01352 01353 /**\brief Collect lock table information in a virtual table. 01354 * \ingroup SSMVTABLE 01355 * \details 01356 * @param[out] v The virtual table to populate. 01357 * @param[in] names_too If true, make the 01358 * first row of the table a list of the attribute names. 01359 * 01360 * All attribute values will be strings. 01361 * The virtual table v can be printed with its output operator 01362 * operator<< for ostreams. 01363 * 01364 * \attention Not atomic. Can yield stale data. 01365 * Cannot be used in a multi-threaded-transaction context. 01366 */ 01367 static rc_t lock_collect(vtable_t&v, bool names_too=true); 01368 01369 /**\brief Collect thread information in a virtual table. 01370 * \ingroup SSMVTABLE 01371 * \details 01372 * @param[out] v The virtual table to populate. 01373 * @param[in] names_too If true, make the 01374 * first row of the table a list of the attribute names. 01375 * 01376 * All attribute values will be strings. 01377 * The virtual table v can be printed with its output operator 01378 * operator<< for ostreams. 01379 * 01380 * \attention Not thread-safe. Can yield stale data. 01381 */ 01382 static rc_t thread_collect(vtable_t&v, bool names_too=true); 01383 01384 /**\brief Take a checkpoint. 01385 * \ingroup SSMAPIDEBUG 01386 * \note For debugging only! 01387 * 01388 * Force the storage manager to take a checkpoint. 01389 * Checkpoints are fuzzy : they can be taken while most other 01390 * storage manager activity is happening, even though they have 01391 * to be serialized with respect to each other, and with respect to 01392 * a few other activities. 01393 * 01394 * This is thread-safe. 01395 */ 01396 static rc_t checkpoint(); 01397 01398 /**\brief Force the buffer pool to flush its pages to disk. 
01399 * \ingroup SSMAPIDEBUG 01400 * @param[in] invalidate True means discard pages after flush. 01401 * \note For debugging only! 01402 * \attention Do not call force_buffers with anything pinned. 01403 * You may cause latch-latch deadlocks, as this method has 01404 * to scan the entire buffer pool and possibly EX-latch pages to prevent 01405 * others from updating while it forces to disk. 01406 * Since the page-order is essentially random, we cannot 01407 * preclude latch-latch deadlocks with other threads. 01408 */ 01409 static rc_t force_buffers(bool invalidate = false); 01410 01411 /**\brief Force the buffer pool to flush the volume header page(s) 01412 * to disk. 01413 * \ingroup SSMAPIDEBUG 01414 * @param[in] vid ID of the volume of interest 01415 * \note For debugging only! 01416 * \attention Do not call force_vol_hdr_buffers with anything pinned. 01417 * You could cause latch-latch deadlocks, as this method has 01418 * to scan the entire buffer pool and possibly EX-latch some pages. 01419 * Since the page-order is essentially random, we cannot 01420 * preclude latch-latch deadlocks with other threads. 01421 */ 01422 static rc_t force_vol_hdr_buffers( const vid_t& vid); 01423 01424 /**\brief Force the buffer pool to flush to disk all pages 01425 * for the given store. 01426 * \ingroup SSMAPIDEBUG 01427 * @param[in] stid Store whose pages are to be flushed. 01428 * @param[in] invalidate True means discard the pages after flushing. 01429 * \note For debugging only! 01430 * \attention Do not call force_store_buffers with anything pinned. 01431 * You may cause latch-latch deadlocks, as this method has 01432 * to scan the entire buffer pool and, if invalidate==true, 01433 * EX-latch pages to prevent others from updating 01434 * while it forces to disk. 01435 * Since the page-order is essentially random, we cannot 01436 * preclude latch-latch deadlocks with other threads.
01437 */ 01438 static rc_t force_store_buffers(const stid_t & stid, 01439 bool invalidate); 01440 01441 /**\cond skip 01442 * Do not document. Very un-thread-safe. 01443 */ 01444 static rc_t dump_buffers(ostream &o); 01445 static rc_t dump_locks(ostream &o); 01446 static rc_t dump_locks(); // defaults to std::cout 01447 static rc_t dump_exts(ostream &o, 01448 vid_t v, 01449 extnum_t start, 01450 extnum_t end); 01451 01452 static rc_t dump_stores(ostream &o, 01453 vid_t v, 01454 int start, 01455 int end); 01456 01457 static rc_t dump_histo(ostream &o, bool locked); 01458 01459 static rc_t snapshot_buffers( 01460 u_int& ndirty, 01461 u_int& nclean, 01462 u_int& nfree, 01463 u_int& nfixed); 01464 /**\endcond skip */ 01465 01466 /**\brief Get a copy of the statistics from an attached instrumented transaction. 01467 * \ingroup SSMXCT 01468 * \details 01469 * @param[out] stats Returns a copy of the statistics for this transaction. 01470 * @param[in] reset If true, the statistics for this transaction will be zeroed. 01471 */ 01472 static rc_t gather_xct_stats( 01473 sm_stats_info_t& stats, 01474 bool reset = false); 01475 01476 /**\brief Get a copy of the global statistics. 01477 * \ingroup SSMSTATS 01478 * \details 01479 * @param[out] stats A pre-allocated structure. 01480 */ 01481 static rc_t gather_stats( 01482 sm_stats_info_t& stats 01483 ); 01484 01485 /**\brief Get a copy of configuration-dependent information. 01486 * \ingroup OPT 01487 * \details 01488 * @param[out] info A pre-allocated structure. 01489 */ 01490 static rc_t config_info(sm_config_info_t& info); 01491 01492 /**\brief Set sleep time before I/O operations. 01493 * \ingroup SSMVOL 01494 * \details 01495 * This method sets a milli_sec delay to occur before 01496 * each disk read/write operation. This is for debugging. 01497 * It is useful in discovering thread sync bugs. 01498 * This delay applies to all threads. 
01499 */ 01500 static rc_t set_disk_delay(u_int milli_sec); 01501 01502 /**\cond skip */ 01503 // TODO : document crash testing facilities 01504 /**\brief Simulate a crash 01505 * \details 01506 * This method tells the log manager to start generating corrupted 01507 * log records. This will make it appear that a crash occurred 01508 * at that point in the log. A call to this method should be 01509 * followed immediately by a dirty shutdown of the ssm. 01510 */ 01511 static rc_t start_log_corruption(); 01512 01513 /* for smsh/debugging: 01514 * log an arbitrary message */ 01515 static rc_t log_message(const char * const msg); 01516 /**\endcond skip */ 01517 01518 // Forces a log flush 01519 static rc_t sync_log(bool block=true); 01520 static rc_t flush_until(lsn_t& anlsn, bool block=true); 01521 01522 // Allowing to access info about the important lsns (curr and durable) 01523 static rc_t get_curr_lsn(lsn_t& anlsn); 01524 static rc_t get_durable_lsn(lsn_t& anlsn); 01525 01526 01527 /* 01528 Device and Volume Management 01529 ---------------------------- 01530 A device is either an operating system file or operating system 01531 device and is identified by a path name (absolute or relative). 01532 A device has a quota. In theory, a device may have 01533 multiple volumes on it but 01534 in the current implementation the maximum number of volumes 01535 is 1. 01536 01537 A volume is where data is stored. A volume is identified 01538 uniquely and persistently by a long volume ID (lvid_t). 01539 Volumes can be used whenever the device they are located 01540 on is mounted by the SM. Volumes have a quota. The 01541 sum of the quotas of all the volumes on a device cannot 01542 exceed the device quota. 
01543 01544 The basic steps to begin using a new device/volume are: 01545 format_dev: initialize the device 01546 mount_dev: allow use of the device and all its volumes 01547 generate_new_lvid: generate a unique ID for the volume 01548 create_vol: create a volume on the device 01549 */ 01550 01551 /* 01552 * Device management functions 01553 */ 01554 /**\addtogroup SSMVOL 01555 * The storage manager was designed to permit multiple \e volumes 01556 * on a \e device, with \e volume analogous to a Unix \e partition and 01557 * a \e device analogous to a disk, and the original SHORE contained 01558 * symmetric peer servers. 01559 * However good that intention, multiple volumes on a device were never 01560 * implemented, and times have changed, and the storage manager no 01561 * longer has any notion of remote and local volumes. 01562 * The notion of a volume, separate from a device, remains, but may 01563 * some day disappear. 01564 * 01565 * For the time being, a device contains at most one volume. 01566 * 01567 * A device is either an operating system file or 01568 * an operating system device (e.g., raw disk partition) and 01569 * is identified by a path name (absolute or relative). 01570 * 01571 * A device has a quota. 01572 * A device is intended to have multiple volumes on it, but 01573 * in the current implementation the maximum number of volumes 01574 * is exactly 1. 01575 * 01576 * A volume is where data are stored. 01577 * Each volume is a header and a set of pages. All pages are 01578 * the same size (this is a compile-time constant, the default being 01579 * 8K and sizes up to 64K permissible). 01580 * 01581 * A volume is identified uniquely and persistently by a 01582 * long volume ID (lvid_t), which is stored in its header. 01583 * Volumes can be used whenever the device they are located 01584 * on is mounted by the SM. 01585 * Volumes have a quota. The 01586 * sum of the quotas of all the volumes on a device cannot 01587 * exceed the device quota.
01588 * 01589 * A volume contains a variety of data structures. All user 01590 * data reside in \e stores. A store is a collection of the 01591 * pages on the volume, allocated in \e extents of a size that 01592 * is a compile-time constant. (The storage manager has only 01593 * been tested with an extent-size of 8 pages. The compile-time constant 01594 * can be changed, but it also requires changes elsewhere in the code 01595 * to maintain alignment of persistent structures. 01596 * See the comments in config/shore.def.) Thus, the minimum size 01597 * of a store is one extent's worth of pages. 01598 * Larger extents provide better clustering, but more wasted space if 01599 * small files and small indexes will be common. 01600 * 01601 * Stores are identified by a store number (snum_t). 01602 * 01603 * Each volume contains a few stores that are "overhead": 01604 * 0 -- is reserved for an extent map and a store map 01605 * 1 -- directory (dir_m) 01606 * 2 -- root index 01607 * 01608 * Beyond that, for each (user) file created, 2 stores are used, one for 01609 * small objects, one for large objects, and for each index (btree, rtree) 01610 * created 1 store is used. 01611 * 01612 * Each volume is laid out thus: 01613 * - volume header, which identifies the number of extents on 01614 * the volume, determined when the volume is formatted. 01615 * This is always in page 1 of the volume. 01616 * - store map: some number of pages describing the stores on the volume, 01617 * namely, being the heads of linked-lists of extents that make up 01618 * the stores. The number of such pages is determined when the 01619 * volume is formatted. The worst case is assumed, which is that one 01620 * might fill the volume with one-extent stores. 01621 * - extent map: some number of pages of bitmaps, one bitmap for each 01622 * extent, describing which pages in the extents are allocated or free. 01623 * - data pages: the rest of the volume. 01624 * 01625 */ 01626 01627 /**\brief Format a device.
01628 * \ingroup SSMVOL 01629 * \details 01630 * @param[in] device Operating-system file name of the "device". 01631 * @param[in] quota_in_KB Quota in kilobytes. 01632 * @param[in] force If true, format the device even if it already exists. 01633 * 01634 * Since raw devices always "exist", \a force should be given as true 01635 * for raw devices. 01636 * 01637 * A device may not be formatted if it is already mounted. 01638 * 01639 * \note This method should \b not 01640 * be called in the context of a transaction. 01641 */ 01642 static rc_t format_dev( 01643 const char* device, 01644 smksize_t quota_in_KB, 01645 bool force); 01646 01647 /**\brief Mount a device. 01648 * \ingroup SSMVOL 01649 * \details 01650 * @param[in] device Operating-system file name of the "device". 01651 * @param[out] vol_cnt Number of volumes on the device. 01652 * @param[out] devid A local device id assigned by the storage manager. 01653 * @param[in] local_vid A local handle to the (only) volume on the device, 01654 * to be used when a volume is mounted. The default, vid_t::null, 01655 * indicates that the storage manager can choose a value for this. 01656 * 01657 * \note It is fine to mount a device more than once, as long as device 01658 * is always the same (you cannot specify a hard link or soft link to 01659 * an entity mounted under a different path). 01660 * Device mounts are \b not reference-counted, so a single dismount_dev 01661 * renders the volumes on the device unusable. 01662 * 01663 * \note This method should \b not 01664 * be called in the context of a transaction. 01665 */ 01666 static rc_t mount_dev( 01667 const char* device, 01668 u_int& vol_cnt, 01669 devid_t& devid, 01670 vid_t local_vid = vid_t::null); 01671 01672 /**\brief Dismount a device. 01673 * \ingroup SSMVOL 01674 * \details 01675 * @param[in] device Operating-system file name of the "device".
01676 * 01677 * \note It is fine to mount a device more than once, as long as device 01678 * is always the same (you cannot specify a hard link or soft link to 01679 * an entity mounted under a different path). 01680 * Device mounts are \b not reference-counted, so a single dismount_dev 01681 * renders the volumes on the device unusable. 01682 * 01683 * \note This method should \b not 01684 * be called in the context of a transaction. 01685 */ 01686 01687 static rc_t dismount_dev(const char* device); 01688 01689 /**\brief Dismount all mounted devices. 01690 * \ingroup SSMVOL 01691 * 01692 * \note This method should \b not 01693 * be called in the context of a transaction. 01694 */ 01695 static rc_t dismount_all(); 01696 01697 // list_devices returns an array of char* pointers to the names of 01698 // all mounted devices. Note that the use of char*'s is 01699 // a temporary hack until a standard string class is available. 01700 // the char* pointers are pointing directly into the device 01701 // mount table. 01702 // dev_cnt is the length of the list returned. 01703 // dev_list and devid_list must be deleted with delete [] by the 01704 // caller if they are not null (0). They should be null 01705 // if an error is returned or if there are no devices. 01706 /**\brief Return a list of all mounted devices. 01707 * \ingroup SSMVOL 01708 * \details 01709 * @param[out] dev_list Returned list of pointers directly into the mount table. 01710 * @param[out] devid_list Returned list of associated device ids. 01711 * @param[out] dev_cnt Returned number of entries in the two above lists. 01712 * 01713 * The storage manager allocates the arrays returned with new[], and the 01714 * caller must return these to the heap with delete[] if they are not null. 01715 * They will be null if an error is returned or if no devices are mounted. 01716 * 01717 * The strings to which dev_list[*] point are \b not to be deleted by 01718 * the caller.
01719 */ 01720 static rc_t list_devices( 01721 const char**& dev_list, 01722 devid_t*& devid_list, 01723 u_int& dev_cnt); 01724 01725 /**\brief Return a list of all volumes on a device. 01726 * \ingroup SSMVOL 01727 * \details 01728 * @param[in] device Operating-system file name of the "device". 01729 * @param[out] lvid_list Returned list of the long volume ids (lvid_t) of the volumes on the device. 01730 * @param[out] lvid_cnt Returned length of list lvid_list. 01731 * 01732 * The storage manager allocates the array lvid_list 01733 * with new[], and the 01734 * caller must return it to the heap with delete[] if it is not null. 01735 * It will be null if an error is returned. 01736 * 01737 * \note This method should \b not 01738 * be called in the context of a transaction. 01739 */ 01740 static rc_t list_volumes( 01741 const char* device, 01742 lvid_t*& lvid_list, 01743 u_int& lvid_cnt 01744 ); 01745 01746 // get_device_quota returns the "quota" (in KB) of the device 01747 // and the amount of the quota allocated to volumes on the device. 01748 /**\brief Get the device quota. 01749 * \ingroup SSMVOL 01750 * \details 01751 * @param[in] device Operating-system file name of the "device". 01752 * @param[out] quota_KB Returned quota in kilobytes 01753 * @param[out] quota_used_KB Returned portion of quota allocated to volumes 01754 * 01755 * The quota_used_KB is the portion of the quota allocated to volumes on the device. 01756 * 01757 * \note This method \b may 01758 * be called in the context of a transaction. 01759 * 01760 * 01761 *
01762 */ 01763 static rc_t get_device_quota( 01764 const char* device, 01765 smksize_t& quota_KB, 01766 smksize_t& quota_used_KB); 01767 01768 01769 /* 01770 * Volume management functions 01771 */ 01772 01773 /**\brief Change the fake disk latency before I/Os on this volume, 01774 * for debugging purposes 01775 * \ingroup SSMVOL 01776 * \details 01777 * @param[in] vid The ID of the volume of interest. 01778 * @param[in] adelay Nanoseconds to sleep with ::nanosleep() 01779 * 01780 * This is for debugging only. 01781 * Changing the value of the latency for a volume does not enable the 01782 * delay. 01783 */ 01784 static rc_t set_fake_disk_latency(vid_t vid, const int adelay); 01785 01786 /**\brief Enable the fake disk latency before I/Os on this volume, for debugging purposes 01787 * \ingroup SSMVOL 01788 * \details 01789 * @param[in] vid The ID of the volume of interest. 01790 * 01791 * This is for debugging only. 01792 * When this is enabled, it uses whatever disk latency was set with 01793 * ss_m::create_vol() or the last applied ss_m::set_fake_disk_latency(). 01794 */ 01795 static rc_t enable_fake_disk_latency(vid_t vid); 01796 /**\brief Disable the fake disk latency before I/Os on this volume, for debugging purposes 01797 * \ingroup SSMVOL 01798 * \details 01799 * @param[in] vid The ID of the volume of interest. 01800 * 01801 * This is for debugging only. 01802 */ 01803 static rc_t disable_fake_disk_latency(vid_t vid); 01804 01805 01806 /**\brief Generate a unique long volume id. 01807 * \ingroup SSMVOL 01808 * \details 01809 * @param[out] lvid Returned long volume id, to be used with ss_m::create_vol(). 01810 * 01811 * This generates a unique volume identifier to be written persistently 01812 * on the volume when it is formatted. 01813 * This enables us to avoid the mistake of doubly-mounting a volume. 01814 * The identifier is constructed from the machine network address and the 01815 * time of day.
01816 */ 01817 static rc_t generate_new_lvid(lvid_t& lvid); 01818 01819 /**\brief Add a volume to a device. 01820 * \ingroup SSMVOL 01821 * \details 01822 * @param[in] device_name Operating-system file name of the "device". 01823 * @param[in] lvid Long volume id to use when formatting the new volume. 01824 * @param[in] quota_KB Quota in kilobytes. 01825 * @param[in] skip_raw_init Do not initialize the volume if on a raw device. 01826 * @param[in] local_vid Short volume id by which to refer to this volume. 01827 * If null, the storage manager will assign one. 01828 * @param[in] apply_fake_io_latency See ss_m::enable_fake_disk_latency() 01829 * @param[in] fake_disk_latency See ss_m::set_fake_disk_latency() 01830 * 01831 * \note This method should \b not 01832 * be called in the context of a transaction. 01833 * 01834 * The pages on the volume \b must be zeroed; you can only use 01835 * \a skip_raw_init = true if you have by some other means 01836 * already initialized the volume. 01837 */ 01838 static rc_t create_vol( 01839 const char* device_name, 01840 const lvid_t& lvid, 01841 smksize_t quota_KB, 01842 bool skip_raw_init = false, 01843 vid_t local_vid = vid_t::null, 01844 const bool apply_fake_io_latency = false, 01845 const int fake_disk_latency = 0); 01846 01847 /**\brief Destroy a volume. 01848 * \ingroup SSMVOL 01849 * \details 01850 * @param[in] lvid Long volume id by which the volume is known. 01851 * 01852 * \note This method should \b not 01853 * be called in the context of a transaction. 01854 */ 01855 static rc_t destroy_vol(const lvid_t& lvid); 01856 01857 /**\brief Gets the quotas associated with the volume. 01858 * \ingroup SSMVOL 01859 * @param[in] lvid Long volume id by which the volume is known. 01860 * @param[out] quota_KB Quota given when the volume was created. 01861 * @param[out] quota_used_KB Portion of the quota that has been used by 01862 * allocated extents.
01863 */ 01864 static rc_t get_volume_quota( 01865 const lvid_t& lvid, 01866 smksize_t& quota_KB, 01867 smksize_t& quota_used_KB); 01868 01869 /**\cond skip */ 01870 // check_volume_page_types: strictly for debugging/testing 01871 static rc_t check_volume_page_types(vid_t vid); 01872 /**\endcond skip */ 01873 01874 01875 /**\brief Analyze a volume and report statistics regarding disk usage. 01876 * \ingroup SSMVOL 01877 * @param[in] vid The volume of interest. 01878 * @param[out] du The structure that will hold the collected statistics. 01879 * @param[in] audit If "true", the method acquires a share lock on the 01880 * volume and then will check assertions about the 01881 * correctness of the data structures on the volume. 01882 * If the audit fails, an internal fatal error is generated 01883 * to facilitate debugging. (It will generate a core file if your 01884 * shell permits such.) 01885 * If "false", an IS lock is acquired, which means that the 01886 * statistics will be fuzzy. 01887 * 01888 * Using the audit feature is useful for debugging. 01889 * It is the only safe way to use this method. 01890 * \note The statistics are added to the sm_du_stats_t structure passed in. 01891 * This structure is not cleared by the storage manager. 01892 */ 01893 static rc_t get_du_statistics( 01894 vid_t vid, 01895 sm_du_stats_t& du, 01896 bool audit = true); 01897 01898 /**\brief Analyze a store and report statistics regarding disk usage. 01899 * \ingroup SSMVOL 01900 * @param[in] stid The store of interest. 01901 * @param[out] du The structure that will hold the collected statistics. 01902 * @param[in] audit If "true", the method acquires a share lock on the 01903 * store and then will check assertions about the 01904 * correctness of the data structures on the store. 01905 * 01906 * Using the audit feature is useful for debugging. 01907 * It is the only safe way to use this method.
01908 * 01909 */ 01910 static rc_t get_du_statistics( 01911 const stid_t& stid, 01912 sm_du_stats_t& du, 01913 bool audit = true); 01914 01915 /**\brief Dump disk information about the indicated volume. 01916 * \ingroup SSMVOL 01917 * @param[in] vid The volume of interest. 01918 * 01919 * This function is for debugging. 01920 * It dumps, to the error log, at info_prio priority, 01921 * metadata about the given volume, including the number of extents 01922 * on the volume, the extent size, and the number of pages dedicated 01923 * to store maps and extent maps. Then, for each store on the volume, 01924 * it dumps the status of the store and the extents allocated to 01925 * that store. 01926 * 01927 * This function must be run in a transaction, though the function 01928 * is read-only. 01929 */ 01930 static rc_t dump_vol_store_info(const vid_t &vid); 01931 01932 /**\brief Analyze a volume and collect brief statistics about its usage. 01933 * \ingroup SSMVOL 01934 * @param[in] vid The volume of interest. 01935 * @param[out] volume_stats The statistics are written here. 01936 * @param[in] cc Indicates whether the volume is to be locked 01937 * by this method. Acceptable values are t_cc_none and t_cc_volume. 01938 * 01939 * If no lock is acquired, the method can fail with eRETRY. 01940 * 01941 */ 01942 static rc_t get_volume_meta_stats( 01943 vid_t vid, 01944 SmVolumeMetaStats& volume_stats, 01945 concurrency_t cc = t_cc_none 01946 ); 01947 01948 /**\brief Analyze a volume and collect brief statistics about the usage of individual files. 01949 * \ingroup SSMVOL 01950 * @param[in] vid The volume of interest. 01951 * @param[in] num_files The size of the array file_stats. 01952 * @param[out] file_stats Preallocated array of structs into which to 01953 * write the statistics for the individual files inspected. 01954 * @param[in] batch_calculate True means make one pass over the volume. 01955 * @param[in] cc Indicates whether the volume is to be locked 01956 * by this method.
Acceptable values are t_cc_none and t_cc_volume. 01957 * 01958 * If no lock is acquired and batch_calculate is not set, 01959 * the method can fail with eRETRY. 01960 * 01961 * 01962 * If batch_calculate is true then this works by making one pass 01963 * over the meta data, but it looks at all the meta data. This 01964 * should be the faster way to do the analysis when there are 01965 * many files, and when files use a large portion of the volume. 01966 * 01967 * If batch_calculate is false then each file is updated 01968 * individually, only looking at the extent information for that 01969 * particular file. This requires a pass over the volume for each 01970 * file. (Seek-wise it is less efficient). 01971 * 01972 */ 01973 static rc_t get_file_meta_stats( 01974 vid_t vid, 01975 w_base_t::uint4_t num_files, 01976 SmFileMetaStats* file_stats, 01977 bool batch_calculate = false, 01978 concurrency_t cc = t_cc_none 01979 ); 01980 01981 /**\brief Get the index ID of the root index of the volume. 01982 * \ingroup SSMVOL 01983 * 01984 * @param[in] v Volume of interest. 01985 * @param[out] iid Store ID of the root index. 01986 * \details 01987 * 01988 * Each volume has a root index, which is a well-known 01989 * index available to the server for bootstrapping a database. 01990 * 01991 */ 01992 static rc_t vol_root_index( 01993 const vid_t& v, 01994 stid_t& iid 01995 ) { iid.vol = v; iid.store = store_id_root_index; return RCOK; } 01996 01997 /***************************************************************** 01998 * storage operations: smfile.cpp 01999 *****************************************************************/ 02000 /**\addtogroup SSMSTORE 02001 * Indexes and files are special cases of "stores". 02002 * A store is a linked list of extents, and an extent is a 02003 * contiguous group of pages. So the store is the structure 02004 * that holds together an ordered set of pages that can be 02005 * used by a server and have an identifier (a store ID or stid_t).
02006 * 02007 * Indexes and files of records are built on stores. 02008 * 02009 * Stores have logging properties and 02010 * other metadata associated with them. 02011 * 02012 * The property that determines the logging level of the store is 02013 * \ref sm_store_property_t. 02014 * 02015 * Methods that let you get and change the metadata are: 02016 * - ss_m::get_store_property 02017 * - ss_m::set_store_property 02018 * - ss_m::get_store_info 02019 * - \ref snum_t 02020 * 02021 * When a transaction deletes a file or index, the deletion of the 02022 * underlying stores is delayed until the transaction commits so that 02023 * the pages allocated to the stores remain reserved (lest the 02024 * transaction aborts). The deleting transaction could, in theory, 02025 * reuse the pages for another store, but in practice that is not done. 02026 * Instead, when a store is deleted, the store is marked 02027 * for deletion and put in a list for the transaction to delete upon 02028 * commit. At commit time, stores that have property t_load_file 02029 * or t_insert_file are converted to t_regular. 02030 */ 02031 02032 /**\brief Change the store property of a file or index. 02033 * \ingroup SSMSTORE 02034 * @param[in] stid File ID or index ID of the store to change. 02035 * @param[in] property Enumeration store_property_t (alias for 02036 * smlevel_3::sm_store_property_t, q.v.) 02037 * 02038 * \details 02039 * The possible uses of store properties are described with 02040 * smlevel_3::sm_store_property_t. 02041 */ 02042 static rc_t set_store_property( 02043 stid_t stid, 02044 store_property_t property 02045 ); 02046 02047 /**\brief Get the store property of a file or index. 02048 * \ingroup SSMSTORE 02049 * @param[in] stid File ID or index ID of the store of interest. 02050 * @param[out] property Reference to enumeration store_property_t 02051 * (alias for smlevel_3::sm_store_property_t, q.v.)
02052 * 02053 * \details 02054 * The possible uses of store properties are described with 02055 * smlevel_3::sm_store_property_t. 02056 */ 02057 static rc_t get_store_property( 02058 stid_t stid, 02059 store_property_t& property); 02060 02061 /**\brief Get various store information of a file or index. 02062 * \ingroup SSMSTORE 02063 * @param[in] stid File ID or index ID of the store of interest. 02064 * @param[out] info Reference to sm_store_info_t into which to 02065 * write the results. 02066 * 02067 * \details 02068 * Get internally stored information about a store. 02069 */ 02070 static rc_t get_store_info( 02071 const stid_t& stid, 02072 sm_store_info_t& info); 02073 02074 // 02075 // Functions for B+tree Indexes 02076 // 02077 /**\addtogroup SSMBTREE 02078 * The storage manager supports B+-Tree indexes, which provide associative access 02079 * to data by associating keys with values in 1:1 or many:1 relationships. 02080 * Keys may be composed of any of the basic C-language types (integer, 02081 * unsigned, floating-point of several sizes) or 02082 * variable-length character strings (wide characters are \b not supported). 02083 * 02084 * The number of key-value pairs that an index can hold is limited by the 02085 * space available on the volume containing the index. 02086 * \anchor max_entry_size 02087 * The combined sizes of the key and value must 02088 * be less than or equal to \ref max_entry_size, which is 02089 * a function of the page size, and is 02090 * such that two entries of this size fit on a page along with all 02091 * the page and entry metadata. See sm_config_info_t and ss_m::config_info. 02092 * 02093 * The minimum size of a B-Tree index is 8 pages (1 extent). 02094 * 02095 * A variety of locking protocols is supported: 02096 * - none : acquire no locks on the {key,value} pairs in the index, 02097 * although an intention lock might be acquired on the index. 02098 * - kvl : key-value locking. See \ref MOH1.
The key or 02099 * key-value pair is hashed into a 4-byte value and used with the 02100 * given store id to make a lock id. 02101 * - im : index-management locking. See \ref MOH1. 02102 * The "value" portion of 02103 * the key-value lock is taken to be a record id, which is used 02104 * for the lock id. 02105 * - modified kvl : an ad-hoc protocol used by the Paradise project. See \ref MODKVL "the scan_index_i constructor". As with index-management locking, 02106 * the "value" portion of 02107 * the key-value lock is taken to be a record id, which is used 02108 * for the lock id. 02109 * - file : full-index locking. 02110 * 02111 * \section key_description Key Types 02112 * A B+-Tree index key has a type determined when the index is created. 02113 * All keys are stored in lexicographic format based on an interpretation of 02114 * the key determined by the key description given when the index is 02115 * created. 02116 * Lookups on the B+-Tree then involve a single byte-by-byte 02117 * comparison of two byte-strings, each composed of its concatenated 02118 * sub-keys. 02119 * 02120 * The key description is a null-terminated string as follows: 02121 \verbatim 02122 <key_description> ::= <fixed_len_part>* <variable_len_part> | 02123 <fixed_len_part>+ 02124 <fixed_len_part> ::= <type> <len> 02125 <variable_len_part> ::= <type> '*' <len> 02126 <type> ::= 'i' | 'u' | 'f' | 'b' | 'I' | 'U' | 'F' | 'B' 02127 <len> ::= [1-9][0-9]* 02128 \endverbatim 02129 * Thus, a key may have any number of fixed-length parts followed by at 02130 * most one variable-length part. 02131 * 02132 * The fixed-length parts (if present) consist of a type and a length. 02133 * 02134 * The variable-length part (if present) consists of a type and a length 02135 * separated by an asterisk, which is what distinguishes a variable-length 02136 * from a fixed-length part.
02137 * 02138 * Types and permissible lengths are: 02139 * - integer (1,2,4,8) 02140 * - unsigned (1,2,4,8) 02141 * - floating (4,8) 02142 * - uninterpreted byte (any length greater than zero) 02143 * 02144 * A capital letter indicates that the key part may be compressed. Only prefix 02145 * compression is implemented, so it makes sense to compress if the 02146 * first part of the key is compressible. 02147 * 02148 * Examples: 02149 * - "B40u4u2u2" : 40-byte character string followed by a 4-byte unsigned integer, 02150 * a 2-byte unsigned integer and a 2-byte unsigned integer, such as one might 02151 * use for name.year.mo.day. The character string is 02152 * prefix-compressed. 02153 * - "f8" : an 8-byte floating-point number (double) 02154 * - "I8B*1000" : An 8-byte integer followed by an uninterpreted string 02155 * of up to 1000 bytes, all prefix-compressed. 02156 * 02157 * \note Wide characters are not supported. 02158 * 02159 * This key descriptor is stored in the sm_store_info_t, which is 02160 * stored on the volume and is available with the method ss_m::get_store_info. 02161 * Keys are stored in \ref LEXICOFORMAT "lexicographic format". The 02162 * storage manager knows how to convert all the key types listed above. 02163 * When duplicates are permitted, the index assumes that the elements 02164 * are in lexicographic order when searching for a <key,element> pair. 02165 * 02166 * \section XXXX1 Bulk Loading 02167 * Bulk-loading of all index types is supported. See \ref SSMBULKLD. 02168 */ 02169 02170 02171 /**\brief Create a B+-Tree index. 02172 * \ingroup SSMBTREE 02173 * @param[in] vid Volume on which to create the index. 02174 * @param[in] ntype Type of index. Legitimate values are: 02175 * - t_btree : B+-Tree with duplicate keys allowed 02176 * - t_uni_btree : B+-Tree without duplicate keys 02177 * @param[in] property Logging level of store. Legitimate values are: 02178 * - t_regular 02179 * - t_load_file 02180 * - t_insert_file 02181 * See sm_store_property_t for details.
02182 * @param[in] key_desc Description of key type. 02183 * See \ref key_description for details. 02184 * @param[in] cc The locking protocol to use with this index. See 02185 * smlevel_0::concurrency_t and \ref SSMBTREE. 02186 * @param[out] stid New store ID will be returned here. 02187 */ 02188 static rc_t create_index( 02189 vid_t vid, 02190 ndx_t ntype, 02191 store_property_t property, 02192 const char* key_desc, 02193 concurrency_t cc, 02194 stid_t& stid 02195 ); 02196 02197 /**\brief Create a B+-Tree or R*-Tree index. 02198 * \ingroup SSMBTREE 02199 *\attention For backward compatibility. Will be deprecated later. This overload takes no \a cc (locking protocol) argument. 02200 */ 02201 static rc_t create_index( 02202 vid_t vid, 02203 ndx_t ntype, 02204 store_property_t property, 02205 const char* key_desc, 02206 stid_t& stid 02207 ); 02208 02209 /**\brief Destroy a B+-Tree index. 02210 * \ingroup SSMBTREE 02211 * 02212 * @param[in] iid ID of the index to be destroyed. 02213 */ 02214 static rc_t destroy_index(const stid_t& iid); 02215 02216 /**\brief Bulk-load a B+-Tree index from multiple data sources. 02217 * \ingroup SSMBULKLD 02218 * 02219 * @param[in] stid ID of the index to be loaded. 02220 * @param[in] nsrcs Number of files used for data sources. 02221 * @param[in] source Array of IDs of files used for data sources. 02222 * @param[out] stats Statistics concerning the load activity will be 02223 * written here. 02224 * @param[in] sort_duplicates If "true" the bulk-load will sort 02225 * duplicates by value. 02226 * @param[in] lexify_keys If "true" the keys are assumed not to 02227 * be in 02228 * lexicographic format, and the bulk-load will reformat the key before 02229 * storing it in the index, 02230 * otherwise they are assumed already to be in lexicographic format.
02231 * 02232 * \anchor LEXICOFORMAT 02233 * \b Lexicographic \b format 02234 * is the translation of numbers 02235 * (int, float, double, unsigned, etc) into byte strings 02236 * such that a lexicographic comparison of the byte strings 02237 * yields the same result as the numeric comparison of the 02238 * original data. 02239 * 02240 * \note The data must already have been sorted by 02241 * key in lexicographic format, but the keys themselves don't have 02242 * to be in lexicographic format; if the keys are not already in 02243 * lexicographic format, the \a lexify_keys must be given the value "true". 02244 * 02245 * In the case of duplicate keys, the bulk-load will handle the 02246 * sorting of the elements if \a sort_duplicates is "true"; this 02247 * sort will be done by a lexicographic comparison of the 02248 * byte strings that compose the elements. 02249 */ 02250 static rc_t bulkld_index( 02251 const stid_t& stid, 02252 int nsrcs, 02253 const stid_t* source, 02254 sm_du_stats_t& stats, 02255 bool sort_duplicates = true, 02256 bool lexify_keys = true 02257 ); 02258 /**\brief Bulk-load a B+-Tree index from a single data source. 02259 * \ingroup SSMBULKLD 02260 * 02261 * @param[in] stid ID of the index to be loaded. 02262 * @param[in] source ID of the file used as the data source. 02263 * @param[out] stats Statistics concerning the load activity will be 02264 * written here. 02265 * @param[in] sort_duplicates If "true" the bulk-load will sort 02266 * duplicates by value. 02267 * @param[in] lexify_keys If "true" the keys are assumed not to 02268 * be in 02269 * lexicographic format, and the bulk-load will reformat the key before 02270 * storing it in the index, 02271 * otherwise they are assumed already to be in lexicographic format.
02272 */ 02273 static rc_t bulkld_index( 02274 const stid_t& stid, 02275 const stid_t& source, 02276 sm_du_stats_t& stats, 02277 bool sort_duplicates = true, 02278 bool lexify_keys = true 02279 ); 02280 /**\brief Bulk-load a B+-Tree index from a single data stream. 02281 * \ingroup SSMBULKLD 02282 * 02283 * @param[in] stid ID of the index to be loaded. 02284 * @param[in] sorted_stream Iterator that serves as the data source. 02285 * @param[out] stats Statistics concerning the load activity will be 02286 * written here. 02287 * 02288 * See sort_stream_i. 02289 */ 02290 static rc_t bulkld_index( 02291 const stid_t& stid, 02292 sort_stream_i& sorted_stream, 02293 sm_du_stats_t& stats); 02294 02295 /**\cond skip */ 02296 static rc_t print_index(stid_t stid); 02297 /**\endcond skip */ 02298 02299 /**\brief Create an entry in a B+-Tree index. 02300 * \ingroup SSMBTREE 02301 * 02302 * @param[in] stid ID of the index. 02303 * @param[in] key Key for the association to be created. 02304 * @param[in] el Element for the association to be created. 02305 * 02306 * The combined sizes of the key and element vectors must 02307 * be less than or equal to \ref max_entry_size. \note When compiled with SM_DORA, an optional trailing argument \a bIgnoreLocks (default false) is also accepted. 02308 */ 02309 static rc_t create_assoc( 02310 stid_t stid, 02311 const vec_t& key, 02312 const vec_t& el 02313 #ifdef SM_DORA 02314 , const bool bIgnoreLocks = false 02315 #endif 02316 ); 02317 /**\brief Remove an entry from a B+-Tree index. 02318 * If your index is non-unique (i.e., it may contain 02319 * multiple entries per key), use destroy_all_assoc. 02320 * 02321 * \ingroup SSMBTREE 02322 * 02323 * @param[in] stid ID of the index. 02324 * @param[in] key Key of the entry to be removed. 02325 * @param[in] el Element (value) of the entry to be removed.
02326 */ 02327 static rc_t destroy_assoc( 02328 stid_t stid, 02329 const vec_t& key, 02330 const vec_t& el 02331 #ifdef SM_DORA 02332 , const bool bIgnoreLocks = false 02333 #endif 02334 ); 02335 /**\brief Destroy all entries associated with a key in a B+-Tree index. 02336 * \ingroup SSMBTREE 02337 * 02338 * @param[in] stid ID of the index. 02339 * @param[in] key Key of the entries to be removed. 02340 * @param[out] num_removed The number of entries removed is returned here. 02341 */ 02342 static rc_t destroy_all_assoc( 02343 stid_t stid, 02344 const vec_t& key, 02345 int& num_removed 02346 ); 02347 /**\brief Find an entry associated with a key in a B+-Tree index. 02348 * \ingroup SSMBTREE 02349 * 02350 * @param[in] stid ID of the index. 02351 * @param[in] key Key of the entry to look up. 02352 * @param[out] el Element associated with the given key will be copied into this buffer. 02353 * @param[in,out] elen Length of buffer into which the 02354 * result will be written. If too small, eRECWONTFIT will 02355 * be returned. 02356 * Length of result will be returned here. 02357 * @param[out] found True if an entry is found. 02358 * 02359 * If the index is not unique (allows duplicates), the first 02360 * element found with the given key will be returned. 02361 * 02362 * To locate all entries associated with a non-unique key, you must 02363 * use scan_index_i, q.v.. 02364 */ 02365 static rc_t find_assoc( 02366 stid_t stid, 02367 const vec_t& key, 02368 void* el, 02369 smsize_t& elen, 02370 bool& found 02371 #ifdef SM_DORA 02372 , const bool bIgnoreLocks = false 02373 #endif 02374 ); 02375 02376 // 02377 // Functions for R*tree (multi-dimensional(MD), spatial) Indexes 02378 // 02379 02380 /**\addtogroup SSMRTREE 02381 * 02382 * An R-tree is a height-balanced structure designed for indexing 02383 * multi-dimensional spatial objects. 02384 * It stores the minimal bounding box (with 2 or more dimensions) of 02385 * a spatial object as the key in the leaf pages.
02386 * This implementation is a variant of an R-Tree called an R*-Tree, which 02387 * improves the search performance by using a heuristic for redistributing 02388 * entries and dynamically reorganizing the tree during insertion. 02389 * 02390 * An R*-Tree stores key,value pairs where the key is of type nbox_t 02391 * and the value is of type vec_t. 02392 * 02393 * The number of key-value pairs an index can hold is limited by the space 02394 * available on the volume containing the index. 02395 * The minimum size of an R*-tree index is 8 pages. 02396 * 02397 * 02398 * \note This implementation 02399 * uses coarse-grained (index-level) locking and 02400 * supports only 2 dimensions and integer coordinates. 02401 * For information about R*-trees, see the \ref BKSS. 02402 * 02403 * Example: 02404 * \code 02405 scan_rt_i scan(idx, nbox_t::t_overlap, universe, true); 02406 bool eof; 02407 nbox_t k; 02408 char* e; 02409 smsize_t elen; 02410 int i; 02411 for(i=0; 02412 (!(rc = scan.next(k,e,elen,eof)).is_error() && !eof); 02413 i++) ; 02414 cout << "Rtree " << idx << " contains " << i << " entries." << endl; 02415 \endcode 02416 * 02417 * 02418 * \section XXXX2 Bulk Loading 02419 * Bulk-loading of all index types is supported. See \ref SSMBULKLD. 02420 */ 02421 /*\example rtree_example.cpp*/ 02422 02423 02424 /**\brief Create an R*-Tree (multi-dimensional spatial) index. 02425 * The storage manager does not provide 02426 * complete support for non-unique multidimensional indexes. 02427 * While you may insert multiple (distinct) entries for the same key in 02428 * a multi-dimensional index, you will not be able to use them; only 02429 * the first can be retrieved. 02430 * \ingroup SSMRTREE 02431 * @param[in] vid Volume on which to create the index. 02432 * @param[in] ntype Type of index. Legitimate values are: 02433 * - t_rtree : R*-Tree 02434 * @param[in] property Logging level of store.
Legitimate values are: 02435 * - t_temporary 02436 * - t_regular 02437 * - t_load_file 02438 * - t_insert_file 02439 * See sm_store_property_t for details. 02440 * @param[in] dim Number of dimensions of the key. 02441 * The key type is an nbox_t. 02442 * See \ref nbox_t for details. 02443 * @param[out] stid New store ID will be returned here. 02444 */ 02445 static rc_t create_md_index( 02446 vid_t vid, 02447 ndx_t ntype, 02448 store_property_t property, 02449 stid_t& stid, 02450 int2_t dim = 2 02451 ); 02452 02453 /**\brief Destroy an R*-Tree index. 02454 * \ingroup SSMRTREE 02455 * 02456 * @param[in] iid ID of the index to be destroyed. 02457 */ 02458 static rc_t destroy_md_index(const stid_t& iid); 02459 02460 /**\brief Bulk-load a multi-dimensional index from multiple sources. 02461 * \ingroup SSMBULKLD 02462 * @param[in] stid ID of the index to be loaded. 02463 * @param[in] nsrcs Number of files used for data sources. 02464 * @param[in] source Array of IDs of files used for data sources. 02465 * @param[out] stats Statistics concerning the load activity will be 02466 * written here. 02467 * @param[in] hff Heuristic fill factor. Not used. 02468 * @param[in] hef Heuristic expansion factor. Not used. 02469 * @param[in] universe Universal bounding box of all spatial objects indexed. 02470 */ 02471 static rc_t bulkld_md_index( 02472 const stid_t& stid, 02473 int nsrcs, 02474 const stid_t* source, 02475 sm_du_stats_t& stats, 02476 int2_t hff=75, 02477 int2_t hef=120, 02478 nbox_t* universe=NULL); 02479 02480 /**\brief Bulk-load a multi-dimensional index from a single source. 02481 * The storage manager does not provide 02482 * complete support for non-unique multidimensional indexes. 02483 * While you may insert multiple (distinct) entries for the same key in 02484 * a multi-dimensional index, you will not be able to use them; only 02485 * the first can be retrieved. 02486 * \ingroup SSMBULKLD 02487 * @param[in] stid ID of the index to be loaded.
02488 * @param[in] source ID of file to be used for data source. 02489 * @param[out] stats Statistics concerning the load activity will be 02490 * written here. 02491 * @param[in] hff Heuristic fill factor. Not used. 02492 * @param[in] hef Heuristic expansion factor. Not used. 02493 * @param[in] universe Universal bounding box of all spatial objects indexed. 02494 */ 02495 static rc_t bulkld_md_index( 02496 const stid_t& stid, 02497 const stid_t& source, 02498 sm_du_stats_t& stats, 02499 int2_t hff=75, 02500 int2_t hef=120, 02501 nbox_t* universe=NULL); 02502 02503 /**\brief Bulk-load a multi-dimensional index from a sorted stream source. 02504 * The storage manager does not provide 02505 * complete support for non-unique multidimensional indexes. 02506 * While you may insert multiple (distinct) entries for the same key in 02507 * a multi-dimensional index, you will not be able to use them; only 02508 * the first can be retrieved. 02509 * \ingroup SSMBULKLD 02510 * @param[in] stid ID of the index to be loaded. 02511 * @param[in] sorted_stream Input stream that is data source. 02512 * @param[out] stats Statistics concerning the load activity will be 02513 * written here. 02514 * @param[in] hff Heuristic fill factor. Not used. 02515 * @param[in] hef Heuristic expansion factor. Not used. 02516 * @param[in] universe Universal bounding box of all spatial objects indexed. 02517 */ 02518 static rc_t bulkld_md_index( 02519 const stid_t& stid, 02520 sort_stream_i& sorted_stream, 02521 sm_du_stats_t& stats, 02522 int2_t hff=75, 02523 int2_t hef=120, 02524 nbox_t* universe=NULL); 02525 02526 /**\brief Print a representation of the rtree. 02527 * \ingroup SSMRTREE 02528 * @param[in] stid ID of the index to be printed. 02529 * @param[in] out I/O stream to which to write the output. 02530 */ 02531 static rc_t print_md_index(stid_t stid, ostream &out); 02532 02533 /**\brief Look up an entry in a multi-dimensional index. 
02534 * \ingroup SSMRTREE 02535 * 02536 * @param[in] stid ID of the index. 02537 * @param[in] key Key associated with the entry to look up. 02538 * @param[out] el Element associated with the given key will be copied into this buffer. 02539 * @param[in,out] elen Length of buffer into which the 02540 * result will be written. If too small, eRECWONTFIT will 02541 * be returned. 02542 * Length of result will be returned here. 02543 * @param[out] found True if an entry is found. 02544 * 02545 * If the index is not unique (allows duplicates), the first 02546 * element found with the given key will be returned. 02547 * 02548 * The storage manager does not provide a method to locate all 02549 * entries associated with a non-unique key. 02550 */ 02551 static rc_t find_md_assoc( 02552 stid_t stid, 02553 const nbox_t& key, 02554 void* el, 02555 smsize_t& elen, 02556 bool& found); 02557 02558 /**\brief Create an entry in a multi-dimensional index. 02559 * The storage manager does not provide 02560 * complete support for non-unique multidimensional indexes. 02561 * While you may insert multiple (distinct) entries for the same key in 02562 * a multi-dimensional index, you will not be able to use them; only 02563 * the first can be retrieved. 02564 * \ingroup SSMRTREE 02565 * 02566 * @param[in] stid ID of the index. 02567 * @param[in] key Key for the association to be created. 02568 * @param[in] el Element for the association to be created. 02569 */ 02570 static rc_t create_md_assoc( 02571 stid_t stid, 02572 const nbox_t& key, 02573 const vec_t& el); 02574 02575 /**\brief Destroy an entry in a multi-dimensional index. 02576 * \ingroup SSMRTREE 02577 * 02578 * @param[in] stid ID of the index. 02579 * @param[in] key Key of the entry to be removed. 02580 * @param[in] el Element (value) of the entry to be removed.
02581 */ 02582 static rc_t destroy_md_assoc( 02583 stid_t stid, 02584 const nbox_t& key, 02585 const vec_t& el); 02586 02587 /**\cond skip */ 02588 // for debugging 02589 static rc_t draw_rtree(const stid_t& stid, ostream &); 02590 /**\endcond skip */ 02591 02592 /**\brief Gather usage statistics about an R*-Tree index. 02593 * \ingroup SSMRTREE 02594 * @param[in] stid ID of the index. 02595 * @param[out] stat Usage statistics will be written here. 02596 * @param[in] size Number of uint2_t's in the array ovp. 02597 * @param[out] ovp Pre-allocated array of integers into which 02598 * the method will write the overlap percentages for each level of the 02599 * tree. 02600 * @param[in] audit If "true", the method 02601 * will check assertions about the 02602 * correctness of the rtree. 02603 * If the audit fails an internal fatal error is generated 02604 * to facilitate debugging. (It will generate a core file if your 02605 * shell permits such.) 02606 * 02607 * \note for debugging 02608 */ 02609 static rc_t rtree_stats( 02610 const stid_t& stid, 02611 rtree_stats_t& stat, 02612 uint2_t size = 0, 02613 uint2_t* ovp = NULL, 02614 bool audit = false); 02615 02616 /**\addtogroup SSMFILE 02617 * You can create, destroy, and scan files of records. You may exert some 02618 * control over the order in which records appear in the file (a physical 02619 * scan), but, in general, the storage manager decides where to put records. 02620 * 02621 * Pages in a file are slotted pages: Each page contains an array of 02622 * slots. 02623 * Records take one of three forms: small, large, and very large. 02624 * - Small records fit in the slots on the file pages. 02625 * - Large records are too big to fit on a slotted page, so they are put 02626 * elsewhere, and the slots point to these records. Actually, what is 02627 * in a slot is a small array of page pointers to the data of the large record. 
02628 * - A very large record is one whose slot in the file page contains 02629 * a single reference to a page that is an index of data pages. 02630 * 02631 * Because records may take these forms, the API for creating records 02632 * contains the opportunity for you to provide a hint about the ultimate 02633 * size of the record so that the storage manager can create the proper 02634 * structure for the record immediately, rather than creating a small 02635 * record that is soon to be converted to a large, then a very large record 02636 * by subsequent appends. 02637 * 02638 * All records contain a client-defined header. This is for the convenience 02639 * of server-writers. The header must fit on the slotted page, so it should 02640 * never be very large. 02641 * 02642 * The following methods manipulate files of records and the records found 02643 * there. 02644 * 02645 * Modules below describe file traversal and 02646 * appending to files (\ref SSMSCANF), 02647 * and pinning individual records in the buffer pool for extended operations 02648 * (\ref SSMPIN). 02649 * 02650 * \section UNINIT Uninitialized Data 02651 * The functions create_rec, append_rec, and update_rec can be used to 02652 * write blocks of data that are all zeroes, with minimal logging. 02653 * This is useful for creating records of known size but with uninitialized data. 02654 * The type zvec_t, a special case of vec_t, is for this purpose. 02655 * Construct it with only a size, as follows: 02656 * \code 02657 * zvec_t zdata(100000); 02658 * \endcode 02659 * The underlying logging code recognizes that this is a vector of zeroes and 02660 * logs only a count, not the data themselves. 02661 * 02662 * \section Errors 02663 * If an error occurs in the middle of one of these methods that is updating persistent data, 02664 * the record or file \e could be in an inconsistent state. 02665 * The caller has the choice of aborting the transaction or rolling back to the nearest savepoint (see \ref SSMXCT). 
02666 * 02667 * \sa SSMSCAN, SSMPIN, vec_t, zvec_t, IDs. 02668 */ 02669 02670 /**\brief Create a file of records. 02671 * \ingroup SSMFILE 02672 * \details 02673 * @param[in] vid Volume on which to create a file. 02674 * @param[out] fid Returns (store) ID of the new file here. 02675 * @param[in] property Give the file this property. 02676 * @param[in] cluster_hint Not used. 02677 * 02678 * The cluster hint is included in the API for future use. 02679 * It has no effect. 02680 */ 02681 static rc_t create_file( 02682 vid_t vid, 02683 stid_t& fid, 02684 store_property_t property, 02685 shpid_t cluster_hint = 0 02686 ); 02687 02688 /**\brief Destroy a file of records. 02689 * \ingroup SSMFILE 02690 * \details 02691 * @param[in] fid ID of the file to destroy. 02692 */ 02693 static rc_t destroy_file(const stid_t& fid); 02694 02695 /**\brief Create a new record. 02696 * \ingroup SSMFILE 02697 * \details 02698 * @param[in] fid ID of the file in which to create a record. 02699 * @param[in] hdr What to put in the record's header. 02700 * @param[in] len_hint Hint about how big the record will ultimately be. 02701 * This is used to determine the initial format of the record. If you plan 02702 * to append to the record and know that it will ultimately become a large 02703 * record, it is more efficient to give a size hint that is larger than 02704 * a page here. Otherwise, the record will be made small (as determined by 02705 * the size of the parameter \a data ), and subsequent appends will cause 02706 * the record to be converted to a large record. 02707 * @param[in] data What to put in the record's body. 02708 * @param[out] new_rid ID of the newly created record. 02709 * @param[in] policy File compaction policy to use. See \ref pg_policy_t 02710 * for possible values.
02711 */ 02712 static rc_t create_rec( 02713 const stid_t& fid, 02714 const vec_t& hdr, 02715 smsize_t len_hint, 02716 const vec_t& data, 02717 rid_t& new_rid, 02718 #ifdef SM_DORA 02719 const bool bIgnoreLocks = false, 02720 #endif 02721 uint4_t policy = t_cache | t_compact | t_append 02722 ); 02723 02724 /**\brief Destroy a record. 02725 * \ingroup SSMFILE 02726 * \details 02727 * @param[in] rid ID of the record to destroy. 02728 */ 02729 static rc_t destroy_rec(const rid_t& rid 02730 #ifdef SM_DORA 02731 , const bool bIgnoreLocks = false 02732 #endif 02733 ); 02734 02735 /**\brief Modify the body of an existing record. 02736 * \ingroup SSMFILE 02737 * \details 02738 * @param[in] rid ID of the record to modify. 02739 * @param[in] start First byte to change. 02740 * @param[in] data What to put in the record's body. 02741 * 02742 * This overwrites 02743 * the existing bytes, starting at the offset \a start through the 02744 * byte at \a start + \a data.size(). 02745 * This method \b cannot \b be \b used to change the size of a record. 02746 * Attempting this will result in an error. 02747 */ 02748 static rc_t update_rec( 02749 const rid_t& rid, 02750 smsize_t start, 02751 const vec_t& data); 02752 02753 /**\brief Modify the header of an existing record. 02754 * \ingroup SSMFILE 02755 * \details 02756 * @param[in] rid ID of the record to modify. 02757 * @param[in] start First byte to change. 02758 * @param[in] hdr What to put in the record's header. 02759 * 02760 * This overwrites 02761 * the existing bytes, starting at the offset \a start through the 02762 * byte at \a start + \a hdr.size(). 02763 * This method \b cannot \b be \b used to change the size of a record 02764 * header. There are no methods for appending to or truncating a 02765 * record header.
02766 * 02767 * \sa pin_i::update_rec, \ref SSMPIN 02768 */ 02769 static rc_t update_rec_hdr( 02770 const rid_t& rid, 02771 smsize_t start, 02772 const vec_t& hdr); 02773 // see also pin_i::update_rec*() 02774 02775 /**\brief Append bytes to a record body. 02776 * \ingroup SSMFILE 02777 * \details 02778 * @param[in] rid ID of the record to modify. 02779 * @param[in] data What to append to the record. 02780 * 02781 * \note This appends \b to a record; it does \b not append a record to a file! 02782 * \sa pin_i::append_rec, \ref SSMPIN 02783 */ 02784 static rc_t append_rec( 02785 const rid_t& rid, 02786 const vec_t& data 02787 ); 02788 02789 /**\brief Chop bytes off the end of a record body. 02790 * \ingroup SSMFILE 02791 * \details 02792 * @param[in] rid ID of the record to modify. 02793 * @param[in] amount How many bytes to lop off. 02794 * 02795 * \sa pin_i::truncate_rec, \ref SSMPIN 02796 */ 02797 static rc_t truncate_rec( 02798 const rid_t& rid, 02799 smsize_t amount 02800 ); 02801 02802 /**\brief Chop bytes off the end of a record body. 02803 * \ingroup SSMFILE 02804 * \details 02805 * @param[in] rid ID of the record to modify. 02806 * @param[in] amount How many bytes to lop off. 02807 * @param[out] should_forward Returns true if the record started out 02808 * large but is now small as a result of the truncation. 02809 * This enables a value-added server to take action in this event, 02810 * should it so desire. 02811 * 02812 * \sa pin_i::truncate_rec, \ref SSMPIN 02813 */ 02814 static rc_t truncate_rec( 02815 const rid_t& rid, 02816 smsize_t amount, 02817 bool& should_forward 02818 ); 02819 02820 #ifdef OLDSORT_COMPATIBILITY 02821 typedef ssm_sort::key_info_t key_info_t; 02822 02823 /* old sort physical version */ 02824 /**\brief Sort a file. Deprecated. 
02825 * \details 02826 */ 02827 static rc_t sort_file( 02828 const stid_t& fid, 02829 vid_t vid, 02830 stid_t& sfid, 02831 store_property_t property, 02832 const key_info_t& key_info, 02833 int run_size, 02834 bool ascending = true, 02835 bool unique = false, 02836 bool destructive = false, 02837 bool use_new_sort = true); 02838 02839 /**\brief Sort a file. Deprecated. 02840 * \details 02841 */ 02842 static rc_t new_sort_file( 02843 const stid_t& fid, 02844 vid_t vid, 02845 stid_t& sfid, 02846 store_property_t property, 02847 const key_info_t& key_info, 02848 int run_size, 02849 bool ascending = true, 02850 bool unique = false, 02851 bool destructive = false 02852 ); 02853 #endif /* OLDSORT_COMPATIBILITY */ 02854 02855 typedef ssm_sort::sort_keys_t sort_keys_t; 02856 02857 /* new sort physical version : see notes below */ 02858 /**\brief Sort a file. 02859 * \ingroup SSMSORT 02860 * @param[in] fid File to sort. 02861 * @param[in] sorted_fid File to which to write the results. 02862 * @param[in] nvids Size of array \a vid. 02863 * @param[in] vid Array of IDs of the volumes to use for scratch files. 02864 * @param[in] kl See sort_keys_t. 02865 * @param[in] min_rec_sz Hint of minimum record size in input file. 02866 * @param[in] run_size Number of pages in buffer pool to use for a run. 02867 * @param[in] temp_space Number of pages to use for scratch space. 02868 * (This limits the amount of memory used by the sort). 02869 * 02870 * \details 02871 * Before you call sort_file, you must create an output file \a sorted_fid 02872 * into which sort_file will write the results. 02873 * 02874 * The sort uses temporary files when the input file contains more records 02875 * than can fit in one run (determined by \a run_size). These temporary files 02876 * may be spread across multiple volumes, which is useful if the 02877 * volumes reside on different spindles. The arguments \a nvids 02878 * and \a vid are for indicating the volumes to use for these scratch 02879 * files.
02880 * 02881 * The caller can provide a clue in \a min_rec_sz 02882 * about the minimum record size of the 02883 * input file, which can help the sort's efficiency. 02884 * 02885 * The \a run_size indicates how many buffer-pool pages to use 02886 * for each run. 02887 * Since at all times one page is fixed for output, while the rest are 02888 * for reading the input in runs, the real run size is \a run_size-1. 02889 * 02890 */ 02891 static rc_t sort_file( 02892 const stid_t& fid, // input file 02893 const stid_t& sorted_fid, // output file 02894 int nvids, // array size for vids 02895 const vid_t* vid, // array of vids for temp 02896 // files 02897 // NB: the output file is created by caller-- 02898 // and can be same as input file 02899 sort_keys_t& kl, // kl & 02900 smsize_t min_rec_sz, // for estimating space use 02901 int run_size, // # pages to use for a run 02902 int temp_space // # pages VM to use for scratch 02903 ); 02904 02905 /**\brief Return the short volume ID of a volume. 02906 * \ingroup SSMVOL 02907 * 02908 * @param[in] lvid Long (persistent) volume ID found on the volume's 02909 * header. 02910 * @param[out] vid Short volume ID of a mounted volume. 02911 */ 02912 static rc_t lvid_to_vid( 02913 const lvid_t& lvid, 02914 vid_t& vid); 02915 02916 /**\brief Return the long volume ID of a volume. 02917 * \ingroup SSMVOL 02918 * 02919 * @param[in] vid Short volume ID of a mounted volume. 02920 * @param[out] lvid Long (persistent) volume ID found on the volume's 02921 * header. 02922 */ 02923 static rc_t vid_to_lvid( 02924 vid_t vid, 02925 lvid_t& lvid); 02926 02927 /***************************************************************** 02928 * Locking related functions 02929 * 02930 * NOTE: there are standard conversions from lpid_t, rid_t, and 02931 * stid_t to lockid_t, so wherever a lockid_t parameter is 02932 * specified a lpid_t, rid_t, or stid_t can be used.
02933 * 02934 *****************************************************************/ 02935 02936 #ifdef SLI_HOOKS 02937 /* enable/disable SLI globally for all threads created after this 02938 point. Does *NOT* disable SLI for existing threads. 02939 */ 02940 static void set_sli_enabled(bool enabled); 02941 static void set_elr_enabled(bool enabled); 02942 02943 static rc_t set_log_features(char const* features); 02944 static char const* get_log_features(); 02945 #endif 02946 02947 /**\brief Acquire a lock. 02948 * \ingroup SSMLOCK 02949 * @param[in] n Lock id of the entity to lock. There are 02950 * conversions from record ids, volume ids, store ids, and page ids to 02951 * lockid_t. 02952 * @param[in] m Desired lock mode. Values: EX, SH. 02953 * @param[in] d Desired duration. Values: 02954 * - t_very_long : Held across transaction boundaries; 02955 * cannot be released by unlock() 02956 * - t_long : Released at commit; cannot be released by unlock() 02957 * - t_medium : May be released early by explicit unlock() 02958 * - t_short : May be released early by explicit unlock() 02959 * - t_instant : Not held: acquired and released immediately. Useful 02960 * to see if any other transaction holds an incompatible lock. 02961 * @param[in] timeout Milliseconds willing to block. See timeout_in_ms. 02962 * 02963 * The lock manager is written with these durations in mind, but the 02964 * only durations used by the storage manager are t_instant and t_long. 02965 * Medium-duration locks are used internally in one place. 02966 * 02967 * Durations other than long and instant are not well-tested. 02968 */ 02969 static rc_t lock( 02970 const lockid_t& n, 02971 lock_mode_t m, 02972 lock_duration_t d = t_long, 02973 timeout_in_ms timeout = WAIT_SPECIFIED_BY_XCT 02974 ); 02975 02976 /**\brief Release a lock. 02977 * \ingroup SSMLOCK 02978 * @param[in] n Lock id of the entity to unlock. There are 02979 * conversions from record ids, volume ids, store ids, and page ids to 02980 * lockid_t.
02981 */ 02982 static rc_t unlock(const lockid_t& n); 02983 02984 /**\brief Disable lock escalation on the given entity. 02985 * \ingroup SSMLOCK 02986 * @param[in] n Lock id of the entity on which to disable escalation. There are 02987 * conversions from record ids, volume ids, store ids, and page ids to 02988 * lockid_t. 02989 * @param[in] passOnToDescendants If true, apply this to the descendants 02990 * of \a n. 02991 */ 02992 static rc_t dont_escalate( 02993 const lockid_t& n, 02994 bool passOnToDescendants = true 02995 ); 02996 02997 /**\brief Find the storage-manager-wide escalation thresholds 02998 * \ingroup SSMLOCK 02999 * Default values (used for all transactions until they change 03000 * their per-transaction thresholds) are determined by the 03001 * storage-manager-wide options. 03002 * See \ref SSMOPT. 03003 */ 03004 static rc_t get_escalation_thresholds( 03005 w_base_t::int4_t& toPage, 03006 w_base_t::int4_t& toStore, 03007 w_base_t::int4_t& toVolume); 03008 03009 /**\brief Change the storage-manager-wide escalation thresholds 03010 * \ingroup SSMLOCK 03011 * Default values (used for all transactions until they change 03012 * their per-transaction thresholds) are determined by the 03013 * storage-manager-wide options. 03014 * See \ref SSMOPT. 03015 */ 03016 static rc_t set_escalation_thresholds( 03017 w_base_t::int4_t toPage, 03018 w_base_t::int4_t toStore, 03019 w_base_t::int4_t toVolume); 03020 03021 /**\brief Find out if the attached transaction has an entity locked. 03022 * \ingroup SSMLOCK 03023 * @param[in] n Lock id of the entity to query. There are 03024 * conversions from record ids, volume ids, store ids, and page ids to 03025 * lockid_t. 03026 * @param[out] m Mode of lock held. NL if none. 03027 * @param[in] implicit If "true" the query will return a lock mode if 03028 * an implicit lock is held, otherwise the lock must be held explicitly.
03029 */ 03030 static rc_t query_lock( 03031 const lockid_t& n, 03032 lock_mode_t& m, 03033 bool implicit = false 03034 ); 03035 03036 /***************************************************************** 03037 * Lock Cache related functions 03038 * 03039 * Each transaction has a cache of recently acquired locks. 03040 * The following functions control the use of the cache. 03041 * Note that the functions affect the transaction currently 03042 * associated with the thread. 03043 *****************************************************************/ 03044 // turn on(enable=true) or off/(enable=false) the lock cache 03045 // return previous state. 03046 /**\brief Control lock caching for attached transaction. 03047 * \ingroup SSMLOCK 03048 * 03049 * @param[in] enable Set to true if you want to turn on lock caching 03050 * for the attached transaction. The default is that it is turned on. 03051 * 03052 * Only long-duration locks are cached. 03053 * Lock caching can be turned off by default using the 03054 * sm_lock_caching option. Even with it turned off by default, it 03055 * can be turned on for a given transaction with this method. 03056 * 03057 */ 03058 static rc_t set_lock_cache_enable(bool enable); 03059 03060 /**\brief True if lock cache is enabled for the attached transaction 03061 * \ingroup SSMLOCK 03062 * 03063 * @param[out] enabled Will be set to true if the attached transaction has 03064 * lock caching enabled, false otherwise.
03065 */ 03066 static rc_t lock_cache_enabled(bool& enabled); 03067 03068 private: 03069 03070 static int _instance_cnt; 03071 static option_group_t* _options; 03072 static option_t* _hugetlbfs_path; 03073 static option_t* _reformat_log; 03074 static option_t* _prefetch; 03075 static option_t* _bufpoolsize; 03076 static option_t* _locktablesize; 03077 static option_t* _logdir; 03078 static option_t* _logsize; 03079 static option_t* _logbufsize; 03080 static option_t* _error_log; 03081 static option_t* _error_loglevel; 03082 static option_t* _lockEscalateToPageThreshold; 03083 static option_t* _lockEscalateToStoreThreshold; 03084 static option_t* _lockEscalateToVolumeThreshold; 03085 static option_t* _cc_alg_option; 03086 static option_t* _log_warn_percent; 03087 static option_t* _num_page_writers; 03088 static option_t* _logging; 03089 static option_t* _lock_caching_default; 03090 03091 03092 static rc_t _set_option_logsize( 03093 option_t* opt, 03094 const char* value, 03095 ostream* err_stream); 03096 03097 static rc_t _set_option_lock_escalate_to_page( 03098 option_t* opt, 03099 const char* value, 03100 ostream* err_stream); 03101 03102 static rc_t _set_option_lock_escalate_to_store( 03103 option_t* opt, 03104 const char* value, 03105 ostream* err_stream); 03106 03107 static rc_t _set_option_lock_escalate_to_volume( 03108 option_t* opt, 03109 const char* value, 03110 ostream* err_stream); 03111 03112 static rc_t _set_store_property( 03113 stid_t stid, 03114 store_property_t property); 03115 03116 static rc_t _get_store_property( 03117 stid_t stid, 03118 store_property_t& property); 03119 03120 static rc_t _begin_xct( 03121 sm_stats_info_t* stats, // allocated by caller 03122 tid_t& tid, 03123 timeout_in_ms timeout); 03124 03125 static rc_t _commit_xct( 03126 sm_stats_info_t*& stats, 03127 bool lazy, 03128 lsn_t* plastlsn); 03129 03130 static rc_t _commit_xct_group( 03131 xct_t * list[], 03132 int listlen); 03133 03134 static rc_t _prepare_xct( 03135 
sm_stats_info_t*& stats, 03136 vote_t& v); 03137 03138 static rc_t _set_coordinator(const server_handle_t &); 03139 03140 static rc_t _enter_2pc(const gtid_t &); 03141 static rc_t _force_vote_readonly(); 03142 static rc_t _recover_2pc(const gtid_t &,// in 03143 bool mayblock, 03144 tid_t & //out -- attached if found(?) 03145 ); 03146 static rc_t _chain_xct( 03147 sm_stats_info_t*& stats, 03148 bool lazy); 03149 03150 static rc_t _abort_xct( 03151 sm_stats_info_t*& stats); 03152 03153 static rc_t _save_work(sm_save_point_t& sp); 03154 03155 static rc_t _rollback_work(const sm_save_point_t& sp); 03156 static rc_t _mount_dev( 03157 const char* device, 03158 u_int& vol_cnt, 03159 vid_t local_vid); 03160 03161 static rc_t _dismount_dev( 03162 const char* device, 03163 bool dismount_if_locked = true 03164 ); 03165 static rc_t _create_vol( 03166 const char* device_name, 03167 const lvid_t& lvid, 03168 smksize_t quota_KB, 03169 bool skip_raw_init, 03170 const bool apply_fake_io_latency, 03171 const int fake_disk_latency); 03172 03173 static rc_t _create_index( 03174 vid_t vid, 03175 ndx_t ntype, 03176 store_property_t property, 03177 const char* key_desc, 03178 concurrency_t cc, 03179 stid_t& stid 03180 ); 03181 03182 static rc_t _destroy_index(const stid_t& iid); 03183 03184 static rc_t _get_store_info( 03185 const stid_t & stid, 03186 sm_store_info_t& info); 03187 03188 static rc_t _bulkld_index( 03189 const stid_t& stid, 03190 int nsrcs, 03191 const stid_t* source, 03192 sm_du_stats_t& stats, 03193 bool sort_duplicates = true, 03194 bool lexify_keys = true 03195 ); 03196 03197 static rc_t _bulkld_index( 03198 const stid_t& stid, 03199 sort_stream_i& sorted_stream, 03200 sm_du_stats_t& stats 03201 ); 03202 03203 static rc_t _print_index(const stid_t &iid); 03204 03205 static rc_t _create_assoc( 03206 const stid_t & stid, 03207 const vec_t& key, 03208 const vec_t& el 03209 #ifdef SM_DORA 03210 , const bool bIgnoreLocks = false 03211 #endif 03212 ); 03213 03214 static rc_t 
_destroy_assoc( 03215 const stid_t & stid, 03216 const vec_t& key, 03217 const vec_t& el 03218 #ifdef SM_DORA 03219 , const bool bIgnoreLocks = false 03220 #endif 03221 ); 03222 03223 static rc_t _destroy_all_assoc( 03224 const stid_t& stid, 03225 const vec_t& key, 03226 int& num_removed 03227 ); 03228 static rc_t _find_assoc( 03229 const stid_t& stid, 03230 const vec_t& key, 03231 void* el, 03232 smsize_t& elen, 03233 bool& found 03234 #ifdef SM_DORA 03235 , const bool bIgnoreLocks = false 03236 #endif 03237 ); 03238 03239 // below method overloaded for rtree 03240 static rc_t _create_md_index( 03241 vid_t vid, 03242 ndx_t ntype, 03243 store_property_t property, 03244 stid_t& stid, 03245 int2_t dim=2 03246 ); 03247 03248 static rc_t _destroy_md_index(const stid_t& iid); 03249 03250 static rc_t _destroy_md_assoc( 03251 stid_t stid, 03252 const nbox_t& key, 03253 const vec_t& el); 03254 03255 static rc_t _bulkld_md_index( 03256 const stid_t& stid, 03257 int nsrcs, 03258 const stid_t* source, 03259 sm_du_stats_t& stats, 03260 int2_t hff, // for rtree only 03261 int2_t hef, // for rtree only 03262 nbox_t* universe);// for rtree only 03263 03264 static rc_t _bulkld_md_index( 03265 const stid_t& stid, 03266 sort_stream_i& sorted_stream, 03267 sm_du_stats_t& stats, 03268 int2_t hff, // for rtree only 03269 int2_t hef, // for rtree only 03270 nbox_t* universe);// for rtree only 03271 03272 static rc_t _print_md_index(stid_t stid, ostream &); 03273 03274 static rc_t _create_md_assoc( 03275 stid_t stid, 03276 const nbox_t& key, 03277 const vec_t& el); 03278 03279 static rc_t _find_md_assoc( 03280 stid_t stid, 03281 const nbox_t& key, 03282 void* el, 03283 smsize_t& elen, 03284 bool& found); 03285 03286 // 03287 // The following functions deal with files of records. 
//
    static rc_t _destroy_n_swap_file(
        const stid_t& old_fid,
        const stid_t& new_fid);

    static rc_t _create_file(
        vid_t vid,
        stid_t& fid,
        store_property_t property,
        shpid_t cluster_hint = 0
    );

    static rc_t _destroy_file(const stid_t& fid);

    // NOTE: _create_rec, _destroy_rec, _update_rec and _update_rec_hdr
    // each gain an extra trailing parameter, bIgnoreLocks (default
    // false), but only when the build defines SM_DORA.
    static rc_t _create_rec(
        const stid_t& fid,
        const vec_t& hdr,
        smsize_t len_hint,
        const vec_t& data,
        rid_t& new_rid,
        uint4_t policy
#ifdef SM_DORA
        , const bool bIgnoreLocks = false
#endif
        );

    static rc_t _destroy_rec(
        const rid_t& rid
#ifdef SM_DORA
        , const bool bIgnoreLocks = false
#endif
        );

    static rc_t _update_rec(
        const rid_t& rid,
        smsize_t start,
        const vec_t& data
#ifdef SM_DORA
        , const bool bIgnoreLocks = false
#endif
        );

    static rc_t _update_rec_hdr(
        const rid_t& rid,
        smsize_t start,
        const vec_t& hdr
#ifdef SM_DORA
        , const bool bIgnoreLocks = false
#endif
        );

    static rc_t _append_rec(
        const rid_t& rid,
        const vec_t& data
        );

    // should_forward is taken by non-const reference -- presumably an
    // output flag set by the callee; confirm against the definition.
    static rc_t _truncate_rec(
        const rid_t& rid,
        smsize_t amount,
        bool& should_forward
        );

    static rc_t _draw_rtree(const stid_t& stid, ostream &);

    static rc_t _rtree_stats(
        const stid_t& stid,
        rtree_stats_t& stat,
        uint2_t size,
        uint2_t* ovp,
        bool audit
    );

    // Two overloads of _sort_file follow. The first is declared only
    // when OLDSORT_COMPATIBILITY is defined.
#ifdef OLDSORT_COMPATIBILITY
    /* old sort internal, physical */
    static rc_t _sort_file(
        const stid_t& fid,
        vid_t vid,
        stid_t& sfid,
        store_property_t property,
        const key_info_t& key_info,
        int run_size,
        bool ascending,
        bool unique,
        bool destructive
    );
#endif /* OLDSORT_COMPATIBILITY */

    /* new sort internal, physical */
    static rc_t _sort_file(
        const stid_t& fid, // input file
        const stid_t& sorted_fid, // output file --
                        // created by caller--
                        // can be same as input file
        int nvids, // array size for vids
        const vid_t* vid, // array of vids for temp
        sort_keys_t& kl, // key location info &
        smsize_t min_rec_sz, // for estimating space use
        int run_size, // # pages to use for a run
        int temp_space //# pages VM to use for scratch
    );


    // Declared only when OLDSORT_COMPATIBILITY is defined.
#ifdef OLDSORT_COMPATIBILITY
    /* internal compatibility old sort-> new sort */
    static rc_t _new_sort_file(
        const stid_t& in_fid,
        const stid_t& out_fid,
        const key_info_t& ki,
        int run_size,
        bool ascending,
        bool unique,
        bool keep_orig //!destructive
        );
#endif /* OLDSORT_COMPATIBILITY */

    static store_flag_t _make_store_flag(store_property_t property);
    // reverse function:
    // static store_property_t _make_store_property(w_base_t::uint4_t flag);
    // is in dir_vol_m

    // this is for df statistics DU DF
    // Two overloads: the first is keyed by a volume id (vid_t), the
    // second by a store id (stid_t).
    static rc_t _get_du_statistics(
        vid_t vid,
        sm_du_stats_t& du,
        bool audit);

    static rc_t _get_du_statistics(
        const stid_t & stid,
        sm_du_stats_t& du,
        bool audit);

    static rc_t _get_volume_meta_stats(
        vid_t vid,
        SmVolumeMetaStats& volume_stats,
        concurrency_t cc);

    // file_stats is a raw pointer accompanied by num_files -- presumably
    // an array of num_files entries supplied by the caller; confirm
    // against the definition.
    static rc_t _get_file_meta_stats(
        vid_t vid,
        w_base_t::uint4_t num_files,
        SmFileMetaStats* file_stats,
        bool batch_calculate,
        concurrency_t cc);
};

/**\brief Information about a store that can be queried by the client.
 * \details
 * This information is stored in a store directory on the volume.
 * It can be queried with ss_m::get_store_info.
03436 */ 03437 class sm_store_info_t { 03438 public: 03439 NORET sm_store_info_t(int len) : 03440 store(0), stype(ss_m::t_bad_store_t), 03441 ntype(ss_m::t_bad_ndx_t), cc(ss_m::t_cc_bad), 03442 eff(0), large_store(0), root(0), 03443 nkc(0), keydescrlen(len) 03444 { keydescr = new char[len]; } 03445 03446 NORET ~sm_store_info_t() { if (keydescr) delete[] keydescr; } 03447 03448 /// store number 03449 snum_t store; 03450 /// t_index, t_file, ... See ss_m::store_t. 03451 u_char stype; 03452 /// t_btree, t_rtree,... See ss_m::ndx_t 03453 u_char ntype; 03454 /// t_cc_kvl, t_cc_record,... See ss_m::concurrency_t 03455 u_char cc; 03456 03457 /// Unused: 03458 u_char eff; 03459 03460 /// Store number for associated large-page store, if there is one. 03461 snum_t large_store; 03462 /// Root page if this is an index. 03463 shpid_t root; 03464 /// Number of key components if this is an index. 03465 w_base_t::uint4_t nkc; 03466 /// Size of key description (if this is an index) 03467 int keydescrlen; 03468 /**\brief Variable length string. 03469 * 03470 * He who creates a sm_store_info_t for use with get_store_info() 03471 * is responsible for allocating enough space for 03472 * key descriptors if he expects to find them. 03473 * See \ref key_description. 
03474 */ 03475 char *keydescr; 03476 }; 03477 03478 03479 ostream& operator<<(ostream& o, const vid_t& v); 03480 istream& operator>>(istream& i, vid_t& v); 03481 ostream& operator<<(ostream& o, const extid_t& x); 03482 istream& operator>>(istream& o, extid_t &x); 03483 ostream& operator<<(ostream& o, const stid_t& stid); 03484 istream& operator>>(istream& i, stid_t& stid); 03485 ostream& operator<<(ostream& o, const lpid_t& pid); 03486 istream& operator>>(istream& i, lpid_t& pid); 03487 ostream& operator<<(ostream& o, const shrid_t& r); 03488 istream& operator>>(istream& i, shrid_t& r); 03489 ostream& operator<<(ostream& o, const rid_t& rid); 03490 istream& operator>>(istream& i, rid_t& rid); 03491 ostream& operator<<(ostream& o, const sm_stats_info_t& s); 03492 template<class ostream> 03493 ostream& operator<<(ostream& o, const sm_config_info_t& s) 03494 { 03495 o << " page_size " << s.page_size 03496 << " max_small_rec " << s.max_small_rec 03497 << " lg_rec_page_space " << s.lg_rec_page_space 03498 << " buffer_pool_size " << s.buffer_pool_size 03499 << " max_btree_entry_size " << s.max_btree_entry_size 03500 << " exts_on_page " << s.exts_on_page 03501 << " pages_per_ext " << s.pages_per_ext 03502 << " logging " << s.logging 03503 ; 03504 return o; 03505 } 03506 03507 03508 #ifndef VEC_T_H 03509 #include <vec_t.h> 03510 #endif 03511 03512 #ifndef SM_ESCALATION_H 03513 #include <sm_escalation.h> 03514 #endif 03515 03516 /*<std-footer incl-file-exclusion='SM_H'> -- do not edit anything below this line -- */ 03517 03518 #endif /*</std-footer>*/