/* -*- mode:C++; c-basic-offset:4 -*-
     Shore-MT -- Multi-threaded port of the SHORE storage manager

                       Copyright (c) 2007-2009
      Data Intensive Applications and Systems Laboratory (DIAS)
               Ecole Polytechnique Federale de Lausanne

                         All Rights Reserved.

   Permission to use, copy, modify and distribute this software and
   its documentation is hereby granted, provided that both the
   copyright notice and this permission notice appear in all copies of
   the software, derivative works or modified versions, and any
   portions thereof, and that both notices appear in supporting
   documentation.

   This code is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
   DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
   RESULTING FROM THE USE OF THIS SOFTWARE.
*/

/*<std-header orig-src='shore' incl-file-exclusion='MEM_BLOCK_CPP'>

 $Id: mem_block.cpp,v 1.10 2012/01/02 21:52:21 nhall Exp $

 SHORE -- Scalable Heterogeneous Object REpository

 Copyright (c) 1994-99 Computer Sciences Department, University of
                       Wisconsin -- Madison
 All Rights Reserved.

 Permission to use, copy, modify and distribute this software and its
 documentation is hereby granted, provided that both the copyright
 notice and this permission notice appear in all copies of the
 software, derivative works or modified versions, and any portions
 thereof, and that both notices appear in supporting documentation.

 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

 This software was developed with support by the Advanced Research
 Project Agency, ARPA order number 018 (formerly 8230), monitored by
 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
 Further funding for this work was provided by DARPA through
 Rome Research Laboratory Contract No. F30602-97-2-0247.

*/

/**\cond skip */

#include "w.h"
#include "mem_block.h"
#include "atomic_ops.h"
#include <cstdlib>
#include <stdio.h>
#include <algorithm>
#ifdef __linux
#include <malloc.h>
#endif

// #include <cassert>
#undef assert
void assert_failed(const char *desc, const char *f, int l) {
    fprintf(stdout, "Assertion failed: %s at line %d file %s\n", desc, l, f);
    w_assert0(0);
}
#define assert(x)   if (!(x)) assert_failed(#x, __FILE__, __LINE__);


namespace memory_block {
#if 0 /* keep emacs happy */
} // keep emacs happy...
#endif

block_bits::block_bits(size_t chip_count)
    : _usable_chips(create_mask(chip_count))
    , _zombie_chips(0)
{
    assert(chip_count <= 8*sizeof(bitmap));
}

/**\brief Acquire one chip by finding and claiming a set bit in _usable_chips.
 * (chip_count is the block's chip capacity; it is used only for sanity checks.)
 */
size_t block_bits::acquire_contiguous(size_t chip_count) {
    (void) chip_count; // make gcc happy...

    /* Find the rightmost set bit.

       If the map is smaller than the word size, but logically full, we
       will compute an index equal to the capacity. If the map is
       physically full (_usable_chips == 0) then we'll still compute an
       index equal to the capacity, because 0-1 underflows to all set
       bits. Therefore the check for "full" is the same whether the
       bitmap can use all its bits or not.
     */
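    /* A worked example of the bit trick below, assuming a 64-bit bitmap
       (the concrete values are illustrative only):

           _usable_chips               = ...0000 0110  (chips 1 and 2 free)
           -_usable_chips              = ...1111 1010  (two's complement)
           one_bit = x & -x            = ...0000 0010  (rightmost set bit)
           index   = _popc(one_bit-1)  = _popc(...0001) = 1

       If no chip is free, one_bit == 0, so one_bit-1 is all ones and
       _popc() returns 64 == 8*sizeof(bitmap), the "full" value tested
       below.
     */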
    bitmap one_bit = _usable_chips & -_usable_chips;
    size_t index = _popc(one_bit-1);
    if(index < 8*sizeof(bitmap)) {
        // common case: we have space
        assert(index < chip_count);
        _usable_chips ^= one_bit;
    }
    else {
        // oops... full
        assert(index == 8*sizeof(bitmap));
    }

    return index;
}

void block_bits::release_contiguous(size_t index, size_t chip_count) {
    // assign this chip to the zombie set for later recycling
    (void) chip_count; // keep gcc happy
    assert(index < chip_count);
    bitmap to_free = bitmap(1) << index;
    assert(! (to_free & *usable_chips()));
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov | to_free;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
    bitmap was_free = ov;
    (void) was_free; // keep gcc happy

    assert( ! (was_free & to_free));
}

block_bits::bitmap block_bits::create_mask(size_t bits_set) {
    // doing it this way allows us to create a bitmap of all ones if need be
    return ~bitmap(0) >> (8*sizeof(bitmap) - bits_set);
}

void block_bits::recycle() {
    /* Recycle the zombies in the block.

       Whatever bits have gone zombie since we last recycled become
       OR-ed into the set of usable bits. We also XOR them atomically
       back into the zombie set to clear them out there. That way we
       don't leak bits if a releasing thread races us and adds more
       bits to the zombie set after we read it.
     */
    bitmap newly_usable = *&_zombie_chips;
    _usable_chips |= newly_usable;
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov ^ newly_usable; // XOR clears exactly the bits we just consumed
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
}

memory_block::chip_t block_of_chips::acquire_chip(size_t chip_size, size_t chip_count, size_t)
{
    size_t index = _bits.acquire_contiguous(chip_count);
    memory_block::chip_t result = {0};
    if (index < chip_count) result = get(index, chip_size);
    return result;
}

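/* Illustration of the address arithmetic in release_chip() below. The
   mask trick requires block_size to be a power of two. With a made-up
   block_size of 0x2000 and a chip living at address 0xab3c40, the owning
   block starts at

       0xab3c40 & -0x2000 == 0xab2000

   i.e. the chip's address rounded down to the nearest block boundary.
   The chip index is then the chip's byte offset from that block's _data
   array, divided by chip_size.
*/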
void block_of_chips::release_chip(chip_t ptr, size_t chip_size, size_t chip_count, size_t block_size)
{
    /* Use pointer arithmetic to find the beginning of our block,
       where the block_of_chips object lives.

       Our caller is responsible for making sure this address actually
       falls inside a memory block.
     */
    union { void* v; size_t n; block_of_chips* b; char* c; } u = {ptr.v}, v = u;
    u.n &= -block_size;
    size_t offset = v.c - u.b->_data;
    size_t idx = offset/chip_size;
    assert(u.b->_data + idx*chip_size == ptr.v);
    u.b->_bits.release_contiguous(idx, chip_count);
}


block_of_chips::block_of_chips(size_t chip_size, size_t chip_count, size_t block_size)
    : _bits(chip_count)
    , _owner(0)
    , _next(0)
{
    // make sure all the chips actually fit in this block
    char* end_of_block = get(0, chip_size).c - sizeof(this) + block_size;
    char* end_of_chips = get(chip_count, chip_size).c;
    (void) end_of_block; // keep gcc happy
    (void) end_of_chips; // keep gcc happy
    assert(end_of_chips <= end_of_block);

    /* We purposefully don't check alignment here because some parts
       of the impl cheat for blocks which will never be used to
       allocate anything (the fake_block being the main culprit).
       The pool_of_blocks does check alignment, though.
     */
}

/* chip_size, chip_count and block_size are fixed for any list. We always
 * acquire one chip here.
 */
chip_t block_list::acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
    // Pull a chip off the tail. If that fails, we'll reorganize and
    // try again.
    chip_t ptr = _tail->acquire_chip(chip_size, chip_count, block_size);
    if(ptr.v) {
        _hit_count++;
        return ptr;
    }

    // darn... gotta do it the hard way
    return _slow_acquire(chip_size, chip_count, block_size);
}


block_list::block_list(pool_of_blocks* pool, size_t chip_size, size_t chip_count, size_t block_size)
    : _fake_block(chip_size, chip_count, block_size)
    , _tail(&_fake_block)
    , _pool(pool)
    , _hit_count(0)
    , _avg_hit_rate(0)
{
    /* Make the fake block advertise that it has nothing to give.

       The first time the user tries to /acquire/, the fast case will
       detect that the fake block is "full" and fall back to the slow
       case. The slow case knows about the fake block and will remove
       it from the list.

       This trick lets us minimize the number of branches required by
       the fast-path acquire.
     */
    _fake_block._bits.fake_full();
}


chip_t block_list::_slow_acquire(size_t chip_size,
                                 size_t chip_count, size_t block_size)
{
    // No hit from the last block in the list, so we must rearrange
    // the list or add blocks if necessary.
    _change_blocks(chip_size, chip_count, block_size);
    // try again
    return acquire(chip_size, chip_count, block_size);
}

block_of_chips* block_list::acquire_block(size_t block_size)
{
    union { block_of_chips* b; uintptr_t n; } u = {_pool->acquire_block(this)};
    (void) block_size; // keep gcc happy
    assert((u.n & -block_size) == u.n);
    // Note that the block might already have (still-)allocated
    // chips, so we do not initialize the block.
    return u.b;
}

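/* How a list bootstraps itself (an illustrative walk-through of the code
   above): the constructor installs _fake_block, marked full, as the tail.
   The first acquire() therefore misses, falls into _slow_acquire() and
   then _change_blocks(), which swaps the fake block for a real one taken
   from the pool and links it into a one-element ring. After that, any
   acquire() that finds a free chip in the tail block returns it without
   touching the pool at all.
*/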
/* Rearrange the list. This is called after we failed to
 * acquire a chip from the last block in the list (the tail).
 * We don't know anything about the other blocks in the list yet.
 */
void block_list::_change_blocks(size_t chip_size,
                                size_t chip_count, size_t block_size)
{
    (void) chip_size; // keep gcc happy

    // first time through?
    if(_tail == &_fake_block) {
        // remove the fake block from the list
        _tail = acquire_block(block_size);
        _tail->_next = _tail;
        return;
    }

    /* Check whether we're chewing through our blocks too fast for the
       current ring size.

       If we consistently recycle blocks while they're still more than
       half full then we need a bigger ring, so old blocks have more
       time to cool off between reuses.

       To respond to the end of a spike gracefully, we're going to be
       pretty aggressive about returning blocks to the global pool:
       when recycling blocks we check for runs of blocks which do not
       meet some minimum utilization threshold, discarding all but one
       of them (we keep the last one for buffering purposes).
     */

    // decay_rate is used to compute the average hit rate;
    // consider (roughly) the last 5 blocks
    static double const decay_rate = 1./5;
    _avg_hit_rate = _hit_count*(1-decay_rate) + _avg_hit_rate*decay_rate;

    // min_allocated is where we would like to see the hit counts
    // settle. If we find we are failing to acquire while the hit counts
    // are below this, we must increase the number of blocks in the list
    // (the ring size).
    size_t const min_allocated = (chip_count+1)/2; // 50%

    // max_available is the usable-chip count above which a block counts
    // as nearly empty, making it a candidate for being returned to the
    // pool. It is chip_count minus roughly 10% of chip_count (at least 1):
    // about .9*chip_count, and never more than chip_count-1.
    size_t const max_available = chip_count - std::max((int)(.1*chip_count), 1);

    if(_hit_count < min_allocated && _avg_hit_rate < min_allocated) {
        // too fast... grow the ring
        block_of_chips* new_block = acquire_block(block_size);
        new_block->_next = _tail->_next;
        _tail = _tail->_next = new_block;
    }
    else {
        // compress the run, if any
        block_of_chips* prev = 0;
        block_of_chips* cur = _tail;
        block_of_chips* next;
        while(1) {
            next = cur->_next;
            /* make all zombies in the block usable */
            next->recycle();

            /* Now see how many of the chips are still in use. If too
             * few are available, move the tail, set the hit count to 0,
             * and return, so that the client ends up calling us again
             * and acquiring another block to become the new tail.
             */
            if(next->_bits.usable_count() <= max_available) {
                // Move the tail to avoid certain perverse behavior when
                // the usable count is 0 but chips have been freed at the
                // front of the list. We don't want to forever allocate
                // new blocks just because there's one fully-allocated
                // block in the list. By moving the tail, it slowly
                // circulates through the list and our first inspection
                // will be of a *different* block after the newly
                // allocated block is consumed.
                cur = next;
                break;
            }

            // This block has plenty of usable chips. Enough, in fact,
            // that it's worth releasing it to the underlying pool.
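            // Reaching this point with prev set means the previous
            // iteration found cur nearly empty as well, so cur and next
            // form a run of under-utilized blocks: unlink cur and hand
            // it back to the pool, keeping next (the last block of the
            // run seen so far) as the survivor.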
            if(prev) {
                assert(prev != cur);
                // assert(cur->_bits.usable_count() > max_available);
                assert(next->_bits.usable_count() > max_available);
                prev->_next = next;
                _pool->release_block(cur);
                cur = prev; // reset
            }

            // avoid the endless loop
            if(next == _tail) break;

            prev = cur;
            cur = next;
        } // while

        // Recycle, repair the ring, and advance.
        // NB: if we broke out of the while loop on the first try,
        // we will not have moved the tail at all.
        _tail = cur;
    }

    // _hit_count tracks fast acquires (hits) since the last
    // _change_blocks; start a fresh count.
    _hit_count = 0;
}

/* Recycle all blocks in the list. This is used to force
 * recycling of the heap after a spike of releases,
 * without waiting for the next acquire.
 */
void block_list::recycle(size_t chip_size,
                         size_t chip_count, size_t block_size)
{
    // Artificially set _hit_count so that when we call _change_blocks
    // we can't possibly add a block to the list, and we try to
    // compress instead.
    _hit_count = chip_count;
    _change_blocks(chip_size, chip_count, block_size);
}

block_list::~block_list() {

    // don't free the fake block if we went unused!
    if(_tail == &_fake_block) return;

    // break the cycle so the loop terminates
    block_of_chips* cur = _tail->_next;
    _tail->_next = 0;

    // release blocks until we hit the NULL
    while( (cur = _pool->release_block(cur)) ) {
    }
}

/**\endcond skip */
} // namespace memory_block
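
/* Illustrative usage sketch (comment only, never compiled; my_pool and
   the three size parameters are placeholders standing in for whatever
   pool_of_blocks implementation and block layout the caller really uses):

       block_list list(my_pool, chip_size, chip_count, block_size);

       // fast path: grab a chip from the tail block
       chip_t c = list.acquire(chip_size, chip_count, block_size);

       // a chip goes back through block_of_chips::release_chip(), which
       // locates the owning block from the chip's address

       // after a burst of releases, compress the ring without waiting
       // for the next acquire
       list.recycle(chip_size, chip_count, block_size);
*/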