00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_BASE_H'> 00025 00026 $Id: sm_base.h,v 1.158 2010/12/08 17:37:43 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef SM_BASE_H 00054 #define SM_BASE_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 /**\file sm_base.h 00061 * \ingroup Macros 00062 */ 00063 00064 #ifdef __GNUG__ 00065 #pragma interface 00066 #endif 00067 00068 #include <climits> 00069 #ifndef OPTION_H 00070 #include "option.h" 00071 #endif 00072 #ifndef __opt_error_def_gen_h__ 00073 #include "opt_error_def_gen.h" 00074 #endif 00075 00076 00077 class ErrLog; 00078 class sm_stats_info_t; 00079 class xct_t; 00080 class xct_i; 00081 00082 class device_m; 00083 class io_m; 00084 class bf_m; 00085 class comm_m; 00086 class log_m; 00087 class lock_m; 00088 00089 class tid_t; 00090 class option_t; 00091 00092 #ifndef SM_EXTENTSIZE 00093 #define SM_EXTENTSIZE 8 00094 #endif 00095 #ifndef SM_LOG_PARTITIONS 00096 #define SM_LOG_PARTITIONS 8 00097 #endif 00098 00099 typedef w_rc_t rc_t; 00100 00101 00102 /**\cond skip 00103 * This structure collects the depth on construction 00104 * and checks that it matches the depth on destruction; this 00105 * is to ensure that we haven't forgotten to release 00106 * an anchor somewhere. 00107 * 00108 * We're defining the CHECK_NESTING_VARIABLES macro b/c 00109 * this work is spread out and we want to have 1 place to 00110 * determine whether it's turned on or off; don't want to 00111 * make the mistake of changing the debug level (on which 00112 * it depends) in only one of several places. 00113 * 00114 * NOTE: this doesn't work in a multi-threaded xct context. 00115 * That's b/c the check is too late -- once the count goes 00116 * to zero, another thread can change it and throw off all the 00117 * counts. To be sure, we'd have to use a TLS copy as well 00118 * as the common copy of these counts. 00119 * 00120 * This was on for debug level > 0 but it's been stable 00121 * enough to change it to > 2 00122 */ 00123 #if W_DEBUG_LEVEL > 2 00124 #define CHECK_NESTING_VARIABLES 1 00125 #else 00126 #define CHECK_NESTING_VARIABLES 0 00127 #endif 00128 struct check_compensated_op_nesting { 00129 #if CHECK_NESTING_VARIABLES 00130 xct_t* _xd; 00131 int _depth; 00132 int _line; 00133 const char *const _file; 00134 // static methods are so we can avoid having to 00135 // include xct.h here. 00136 static int compensated_op_depth(xct_t* xd, int dflt); 00137 00138 check_compensated_op_nesting(xct_t* xd, int line, const char *const file) 00139 : _xd(xd), 00140 _depth(_xd? compensated_op_depth(_xd, 0) : 0), 00141 _line(line), 00142 _file(file) 00143 { 00144 } 00145 00146 ~check_compensated_op_nesting() { 00147 if(_xd) { 00148 if( _depth != compensated_op_depth(_xd, _depth) ) { 00149 fprintf(stderr, 00150 "th.%d check_compensated_op_nesting(%d,%s) depth was %d is %d\n", 00151 sthread_t::me()->id, 00152 _line, _file, _depth, compensated_op_depth(_xd, _depth)); 00153 } 00154 00155 00156 w_assert0(_depth == compensated_op_depth(_xd, _depth)); 00157 } 00158 } 00159 #else 00160 check_compensated_op_nesting(xct_t*, int, const char *const) { } 00161 #endif 00162 }; 00163 00164 00165 /**\brief Encapsulates a few types uses in the API */ 00166 class smlevel_0 : public w_base_t { 00167 public: 00168 // Give these enums names for doxygen purposes: 00169 enum error_constant_t { eNOERROR = 0, eFAILURE = -1 }; 00170 enum sm_constant_t { 00171 page_sz = SM_PAGESIZE, // page size (SM_PAGESIZE is set by makemake) 00172 ext_sz = SM_EXTENTSIZE, // extent size 00173 max_exts = max_int4, // max no. extents, must fit extnum_t 00174 #if defined(_POSIX_PATH_MAX) 00175 max_devname = _POSIX_PATH_MAX, // max length of unix path name 00176 // BEWARE: this might be larger than you want. Array sizes depend on it. 00177 // The default might be small enough, e.g., 256; getconf() yields the upper 00178 // bound on this value. 00179 #elif defined(MAXPATHLEN) 00180 max_devname = MAXPATHLEN, 00181 #else 00182 max_devname = 1024, 00183 #endif 00184 max_vols = 20, // max mounted volumes 00185 max_xct_thread = 20, // max threads in a xct 00186 max_servers = 15, // max servers to be connected with 00187 max_keycomp = 20, // max key component (for btree) 00188 max_openlog = SM_LOG_PARTITIONS, // max # log partitions 00189 max_dir_cache = max_vols * 10, 00190 00191 /* XXX I want to propogate sthread_t::iovec_max here, but 00192 it doesn't work because of sm_app.h not including 00193 the thread package. */ 00194 max_many_pages = 8, 00195 00196 srvid_map_sz = (max_servers - 1) / 8 + 1, 00197 ext_map_sz_in_bytes = ((ext_sz + 7) / 8), 00198 00199 dummy = 0 00200 }; 00201 00202 enum { 00203 max_rec_len = max_uint4 00204 }; 00205 00206 typedef sthread_base_t::fileoff_t fileoff_t; 00207 /* 00208 * Sizes-in-Kbytes for for things like volumes and devices. 00209 * A KB is assumes to be 1024 bytes. 00210 * Note: a different type was used for added type checking. 00211 */ 00212 typedef sthread_t::fileoff_t smksize_t; 00213 typedef w_base_t::base_stat_t base_stat_t; 00214 00215 /**\endcond skip */ 00216 00217 /* 00218 * rather than automatically aborting the transaction, when the 00219 * _log_warn_percent is exceeded, this callback is made, with a 00220 * pointer to the xct that did the writing, and with the 00221 * expectation that the result will be one of: 00222 * - return value == RCOK --> proceed 00223 * - return value == eUSERABORT --> victim to abort is given in the argument 00224 * 00225 * The server has the responsibility for choosing a victim and 00226 * for aborting the victim transaction. 00227 * 00228 */ 00229 00230 /**\brief Log space warning callback function type. 00231 * 00232 * For more details of how this is used, see the constructor ss_m::ss_m(). 00233 * 00234 * Storage manager methods check the available log space. 00235 * If the log is in danger of filling to the point that it will be 00236 * impossible to abort a transaction, a 00237 * callback is made to the server. The callback function is of this type. 00238 * The danger point is a threshold determined by the option sm_log_warn. 00239 * 00240 * The callback 00241 * function is meant to choose a victim xct and 00242 * tell if the xct should be 00243 * aborted by returning RC(eUSERABORT). 00244 * 00245 * Any other RC value is returned to the server through the call stack. 00246 * 00247 * The arguments: 00248 * @param[in] iter Pointer to an iterator over all xcts. 00249 * @param[out] victim Victim will be returned here. This is an in/out 00250 * paramter and is initially populated with the transaction that is 00251 * attached to the running thread. 00252 * @param[in] curr Bytes of log consumed by active transactions. 00253 * @param[in] thresh Threshhold just exceeded. 00254 * @param[in] logfile Character string name of oldest file to archive. 00255 * 00256 * This function must be careful not to return the same victim more 00257 * than once, even though the callback may be called many 00258 * times before the victim is completely aborted. 00259 * 00260 * When this function has archived the given log file, it needs 00261 * to notify the storage manager of that fact by calling 00262 * ss_m::log_file_was_archived(logfile) 00263 */ 00264 typedef w_rc_t (*LOG_WARN_CALLBACK_FUNC) ( 00265 xct_i* iter, 00266 xct_t *& victim, 00267 fileoff_t curr, 00268 fileoff_t thresh, 00269 const char *logfile 00270 ); 00271 /**\brief Callback function type for restoring an archived log file. 00272 * 00273 * @param[in] fname Original file name (with path). 00274 * @param[in] needed Partition number of the file needed. 00275 * 00276 * An alternative to aborting a transaction (when the log fills) 00277 * is to archive log files. 00278 * The server can use the log directory name to locate these files, 00279 * and may use the iterator and the static methods of xct_t to 00280 * determine which log file(s) to archive. 00281 * 00282 * Archiving and removing the older log files will work only if 00283 * the server also provides a LOG_ARCHIVED_CALLBACK_FUNCTION 00284 * to restore the 00285 * archived log files when the storage manager needs them for 00286 * rollback. 00287 * This is the function type used for that purpose. 00288 * 00289 * The function must locate the archived log file containing for the 00290 * partition number \a num, which was a suffix of the original log file's 00291 * name. 00292 * The log file must be restored with its original name. 00293 */ 00294 typedef w_base_t::uint4_t partition_number_t; 00295 typedef w_rc_t (*LOG_ARCHIVED_CALLBACK_FUNC) ( 00296 const char *fname, 00297 partition_number_t num 00298 ); 00299 00300 /**\cond skip */ 00301 enum switch_t { 00302 ON = 1, 00303 OFF = 0 00304 }; 00305 /**\endcond skip */ 00306 00307 /**\brief Comparison types used in scan_index_i 00308 * \enum cmp_t 00309 * Shorthand for CompareOp. 00310 */ 00311 enum cmp_t { bad_cmp_t=badOp, eq=eqOp, 00312 gt=gtOp, ge=geOp, lt=ltOp, le=leOp }; 00313 00314 00315 /* used by lock escalation routines */ 00316 enum escalation_options { 00317 dontEscalate = max_int4_minus1, 00318 dontEscalateDontPassOn, 00319 dontModifyThreshold = -1 00320 }; 00321 00322 /**\brief Types of stores. 00323 * \enum store_t 00324 */ 00325 enum store_t { 00326 t_bad_store_t, 00327 /// a b-tree or r-tree index 00328 t_index, 00329 /// a file of records 00330 t_file, 00331 /// t_lgrec is used for storing large record pages 00332 /// and is always associated with some t_file store 00333 t_lgrec 00334 }; 00335 00336 // types of indexes 00337 00338 /**\brief Index types */ 00339 enum ndx_t { 00340 t_bad_ndx_t, // illegal value 00341 t_btree, // B+tree with duplicates 00342 t_uni_btree, // Unique-key btree 00343 t_rtree // R*tree 00344 }; 00345 00346 /**\enum concurrency_t 00347 * \brief 00348 * Lock granularities 00349 * \details 00350 * - t_cc_bad Illegal 00351 * - t_cc_none No locking 00352 * - t_cc_record Record-level locking for files & records 00353 * - t_cc_page Page-level locking for files & records 00354 * - t_cc_file File-level locking for files & records 00355 * - t_cc_vol Volume-level locking for files and indexes 00356 * - t_cc_kvl Key-value locking for B+-Tree indexes 00357 * - t_cc_im Aries IM locking for B+-Tree indexes : experimental 00358 * - t_cc_modkvl Modified key-value locking: experimental 00359 * - t_cc_append Used internally \todo true? 00360 */ 00361 enum concurrency_t { 00362 t_cc_bad, // this is an illegal value 00363 t_cc_none, // no locking 00364 t_cc_record, // record-level 00365 t_cc_page, // page-level 00366 t_cc_file, // file-level 00367 t_cc_vol, 00368 t_cc_kvl, // key-value 00369 t_cc_im, // ARIES IM, not supported yet 00370 t_cc_modkvl, // modified ARIES KVL, for paradise use 00371 t_cc_append // append-only with scan_file_i 00372 }; 00373 00374 /**\enum pg_policy_t 00375 * \brief 00376 * File-compaction policy for creating records. 00377 * \details 00378 * - t_append : append new record to file (preserve order) 00379 * - t_cache : look in cache for pages with space for new record (does 00380 * not preserve order) 00381 * - t_compact: keep file compact even if it means searching the file 00382 * for space in which to create the file (does not preserve 00383 * order) 00384 * 00385 * These are masks - the following combinations are sensible: 00386 * 00387 * - t_append -- preserve sort order 00388 * - t_cache | t_append -- check the cache first, 00389 * append if no luck 00390 * - t_cache | t_compact | t_append -- append to file as a last resort 00391 */ 00392 enum pg_policy_t { 00393 t_append = 0x01, // retain sort order (cache 0 pages) 00394 t_cache = 0x02, // look in n cached pgs 00395 t_compact = 0x04 // scan file for space in pages 00396 00397 }; 00398 00399 /**\cond skip */ 00400 00401 /* 00402 * smlevel_0::operating_mode is always set to 00403 * ONE of these, but the function in_recovery() tests for 00404 * any of them, so we'll give them bit-mask values 00405 */ 00406 enum operating_mode_t { 00407 t_not_started = 0, 00408 t_in_analysis = 0x1, 00409 t_in_redo = 0x2, 00410 t_in_undo = 0x4, 00411 t_forward_processing = 0x8 00412 }; 00413 00414 static concurrency_t cc_alg; // concurrency control algorithm 00415 static bool cc_adaptive; // is PS-AA (adaptive) algorithm used? 00416 00417 #include "e_error_enum_gen.h" 00418 00419 static const w_error_info_t error_info[]; 00420 static void init_errorcodes(); 00421 00422 static void add_to_global_stats(const sm_stats_info_t &from); 00423 static void add_from_global_stats(sm_stats_info_t &to); 00424 00425 static device_m* dev; 00426 static io_m* io; 00427 static bf_m* bf; 00428 static lock_m* lm; 00429 00430 static log_m* log; 00431 static tid_t* redo_tid; 00432 00433 static LOG_WARN_CALLBACK_FUNC log_warn_callback; 00434 static LOG_ARCHIVED_CALLBACK_FUNC log_archived_callback; 00435 static fileoff_t log_warn_trigger; 00436 static int log_warn_exceed_percent; 00437 00438 static int dcommit_timeout; // to convey option to coordinator, 00439 // if it is created by VAS 00440 00441 static ErrLog* errlog; 00442 00443 static bool shutdown_clean; 00444 static bool shutting_down; 00445 static bool logging_enabled; 00446 static bool lock_caching_default; 00447 static bool do_prefetch; 00448 00449 static operating_mode_t operating_mode; 00450 static bool in_recovery() { 00451 return ((operating_mode & 00452 (t_in_redo | t_in_undo | t_in_analysis)) !=0); } 00453 static bool in_recovery_analysis() { 00454 return ((operating_mode & t_in_analysis) !=0); } 00455 static bool in_recovery_undo() { 00456 return ((operating_mode & t_in_undo ) !=0); } 00457 static bool in_recovery_redo() { 00458 return ((operating_mode & t_in_redo ) !=0); } 00459 00460 // these variable are the default values for lock escalation counts 00461 static w_base_t::int4_t defaultLockEscalateToPageThreshold; 00462 static w_base_t::int4_t defaultLockEscalateToStoreThreshold; 00463 static w_base_t::int4_t defaultLockEscalateToVolumeThreshold; 00464 00465 // These variables control the size of the log. 00466 static fileoff_t max_logsz; // max log file size 00467 00468 // This variable controls checkpoint frequency. 00469 // Checkpoints are taken every chkpt_displacement bytes 00470 // written to the log. 00471 static fileoff_t chkpt_displacement; 00472 00473 // The volume_format_version is used to test compatability 00474 // of software with a volume. Whenever a change is made 00475 // to the SM software that makes it incompatible with 00476 // previouly formatted volumes, this volume number should 00477 // be incremented. The value is set in sm.cpp. 00478 static w_base_t::uint4_t volume_format_version; 00479 00480 // This is a zeroed page for use wherever initialized memory 00481 // is needed. 00482 static char zero_page[page_sz]; 00483 00484 // option for controlling background buffer flush thread 00485 static option_t* _backgroundflush; 00486 00487 00488 /* 00489 * Pre-defined store IDs -- see also vol.h 00490 * 0 -- is reserved for the extent map and the store map 00491 * 1 -- directory (see dir.cpp) 00492 * 2 -- root index (see sm.cpp) 00493 */ 00494 enum { 00495 store_id_extentmap = 0, 00496 store_id_directory = 1, 00497 store_id_root_index = 2 00498 }; 00499 00500 enum { 00501 eINTERNAL = fcINTERNAL, 00502 eOS = fcOS, 00503 eOUTOFMEMORY = fcOUTOFMEMORY, 00504 eNOTFOUND = fcNOTFOUND, 00505 eNOTIMPLEMENTED = fcNOTIMPLEMENTED 00506 }; 00507 00508 enum store_flag_t { 00509 // NB: this had better match sm_store_property_t (sm_int_3.h) !!! 00510 // or at least be convted properly every time we come through the API 00511 st_bad = 0x0, 00512 st_regular = 0x01, // fully logged 00513 st_tmp = 0x02, // space logging only, 00514 // file destroy on dismount/restart 00515 st_load_file = 0x04, // not stored in the stnode_t, 00516 // only passed down to 00517 // io_m and then converted to tmp and added to the 00518 // list of load files for the xct. 00519 // no longer needed 00520 st_insert_file = 0x08, // stored in stnode, but not on page. 00521 // new pages are saved as tmp, old pages as regular. 00522 st_empty = 0x100 // store might be empty - used ONLY 00523 // as a function argument, NOT stored 00524 // persistently. Nevertheless, it's 00525 // defined here to be sure that if other 00526 // store flags are added, this doesn't 00527 // conflict with them. 00528 }; 00529 00530 /* 00531 * for use by set_store_deleting_log; 00532 * type of operation to perform on the stnode 00533 */ 00534 enum store_operation_t { 00535 t_delete_store, 00536 t_create_store, 00537 t_set_deleting, 00538 t_set_store_flags, 00539 t_set_first_ext}; 00540 00541 enum store_deleting_t { 00542 t_not_deleting_store = 0, // must be 0: code assumes it 00543 t_deleting_store, 00544 t_store_freeing_exts, 00545 t_unknown_deleting}; 00546 /**\endcond skip */ 00547 }; 00548 00549 /**\cond skip */ 00550 ostream& 00551 operator<<(ostream& o, smlevel_0::store_flag_t flag); 00552 00553 ostream& 00554 operator<<(ostream& o, const smlevel_0::store_operation_t op); 00555 00556 ostream& 00557 operator<<(ostream& o, const smlevel_0::store_deleting_t value); 00558 00559 /**\endcond skip */ 00560 00561 /*<std-footer incl-file-exclusion='SM_BASE_H'> -- do not edit anything below this line -- */ 00562 00563 #endif /*</std-footer>*/