/* -*- mode:C++; c-basic-offset:4 -*-
     Shore-MT -- Multi-threaded port of the SHORE storage manager

                       Copyright (c) 2007-2009
      Data Intensive Applications and Systems Laboratory (DIAS)
               Ecole Polytechnique Federale de Lausanne

                         All Rights Reserved.

   Permission to use, copy, modify and distribute this software and
   its documentation is hereby granted, provided that both the
   copyright notice and this permission notice appear in all copies of
   the software, derivative works or modified versions, and any
   portions thereof, and that both notices appear in supporting
   documentation.

   This code is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
   DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
   RESULTING FROM THE USE OF THIS SOFTWARE.
*/

/*<std-header orig-src='shore' incl-file-exclusion='MEM_BLOCK_CPP'>

 $Id: mem_block.cpp,v 1.10 2012/01/02 21:52:21 nhall Exp $

 SHORE -- Scalable Heterogeneous Object REpository

 Copyright (c) 1994-99 Computer Sciences Department, University of
                       Wisconsin -- Madison
 All Rights Reserved.

 Permission to use, copy, modify and distribute this software and its
 documentation is hereby granted, provided that both the copyright
 notice and this permission notice appear in all copies of the
 software, derivative works or modified versions, and any portions
 thereof, and that both notices appear in supporting documentation.

 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

 This software was developed with support by the Advanced Research
 Project Agency, ARPA order number 018 (formerly 8230), monitored by
 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
 Further funding for this work was provided by DARPA through
 Rome Research Laboratory Contract No. F30602-97-2-0247.

*/

/**\cond skip */

#include "w.h"
#include "mem_block.h"
#include "atomic_ops.h"
#include <cstdlib>
#include <stdio.h>
#include <algorithm>
#ifdef __linux
#include <malloc.h>
#endif

// #include <cassert>
#undef assert
void assert_failed(const char *desc, const char *f, int l) {
    fprintf(stdout, "Assertion failed: %s at line %d file %s\n", desc, l, f);
    w_assert0(0);
}
#define assert(x)   if (!(x)) assert_failed(#x, __FILE__, __LINE__);


namespace memory_block {
#if 0 /* keep emacs happy */
} // keep emacs happy...
#endif

block_bits::block_bits(size_t chip_count)
    : _usable_chips(create_mask(chip_count))
    , _zombie_chips(0)
{
    assert(chip_count <= 8*sizeof(bitmap));
}

/**\brief Acquire one chip by finding and claiming a set bit in _usable_chips.
 * (chip_count is the block's chip capacity; it is used only for sanity checks.)
 */
size_t block_bits::acquire_contiguous(size_t chip_count) {
    (void) chip_count; // make gcc happy...

    /* Find the rightmost set bit.

       If the map is smaller than the word size, but logically full, we
       will compute an index equal to the capacity. If the map is
       physically full (_usable_chips == 0) then we'll still compute an
       index equal to the capacity, because 0-1 underflows to all set
       bits. Therefore the check for "full" is the same whether the
       bitmap can use all its bits or not.
     */
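    /* A worked example of the bit trick below, assuming a 64-bit bitmap
       (the concrete values are illustrative only):

           _usable_chips               = ...0000 0110  (chips 1 and 2 free)
           -_usable_chips              = ...1111 1010  (two's complement)
           one_bit = x & -x            = ...0000 0010  (rightmost set bit)
           index   = _popc(one_bit-1)  = _popc(...0001) = 1

       If no chip is free, one_bit == 0, so one_bit-1 is all ones and
       _popc() returns 64 == 8*sizeof(bitmap), the "full" value tested
       below.
     */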
    bitmap one_bit = _usable_chips & -_usable_chips;
    size_t index = _popc(one_bit-1);
    if(index < 8*sizeof(bitmap)) {
        // common case: we have space
        assert(index < chip_count);
        _usable_chips ^= one_bit;
    }
    else {
        // oops... full
        assert(index == 8*sizeof(bitmap));
    }

    return index;
}

void block_bits::release_contiguous(size_t index, size_t chip_count) {
    // assign this chip to the zombie set for later recycling
    (void) chip_count; // keep gcc happy
    assert(index < chip_count);
    bitmap to_free = bitmap(1) << index;
    assert(! (to_free & *usable_chips()));
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov | to_free;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
    bitmap was_free = ov;
    (void) was_free; // keep gcc happy

    assert( ! (was_free & to_free));
}

block_bits::bitmap block_bits::create_mask(size_t bits_set) {
    // doing it this way allows us to create a bitmap of all ones if need be
    return ~bitmap(0) >> (8*sizeof(bitmap) - bits_set);
}

void block_bits::recycle() {
    /* Recycle the zombies in the block.

       Whatever bits have gone zombie since we last recycled become
       OR-ed into the set of usable bits. We also XOR them atomically
       back into the zombie set to clear them out there. That way we
       don't leak bits if a releasing thread races us and adds more
       bits to the zombie set after we read it.
     */
    bitmap newly_usable = *&_zombie_chips;
    _usable_chips |= newly_usable;
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov ^ newly_usable; // XOR clears exactly the bits we just consumed
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
}

memory_block::chip_t block_of_chips::acquire_chip(size_t chip_size, size_t chip_count, size_t)
{
    size_t index = _bits.acquire_contiguous(chip_count);
    memory_block::chip_t result = {0};
    if (index < chip_count) result = get(index, chip_size);
    return result;
}

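/* Illustration of the address arithmetic in release_chip() below. The
   mask trick requires block_size to be a power of two. With a made-up
   block_size of 0x2000 and a chip living at address 0xab3c40, the owning
   block starts at

       0xab3c40 & -0x2000 == 0xab2000

   i.e. the chip's address rounded down to the nearest block boundary.
   The chip index is then the chip's byte offset from that block's _data
   array, divided by chip_size.
*/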
void block_of_chips::release_chip(chip_t ptr, size_t chip_size, size_t chip_count, size_t block_size)
{
    /* Use pointer arithmetic to find the beginning of our block,
       where the block_of_chips object lives.

       Our caller is responsible for making sure this address actually
       falls inside a memory block.
     */
    union { void* v; size_t n; block_of_chips* b; char* c; } u = {ptr.v}, v = u;
    u.n &= -block_size;
    size_t offset = v.c - u.b->_data;
    size_t idx = offset/chip_size;
    assert(u.b->_data + idx*chip_size == ptr.v);
    u.b->_bits.release_contiguous(idx, chip_count);
}


block_of_chips::block_of_chips(size_t chip_size, size_t chip_count, size_t block_size)
    : _bits(chip_count)
    , _owner(0)
    , _next(0)
{
    // make sure all the chips actually fit in this block
    char* end_of_block = get(0, chip_size).c - sizeof(this) + block_size;
    char* end_of_chips = get(chip_count, chip_size).c;
    (void) end_of_block; // keep gcc happy
    (void) end_of_chips; // keep gcc happy
    assert(end_of_chips <= end_of_block);

    /* We purposefully don't check alignment here because some parts
       of the impl cheat for blocks which will never be used to
       allocate anything (the fake_block being the main culprit).
       The pool_of_blocks does check alignment, though.
     */
}

/* chip_size, chip_count and block_size are fixed for any list. We always
 * acquire one chip here.
 */
chip_t block_list::acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
    // Pull a chip off the tail. If that fails, we'll reorganize and
    // try again.
    chip_t ptr = _tail->acquire_chip(chip_size, chip_count, block_size);
    if(ptr.v) {
        _hit_count++;
        return ptr;
    }

    // darn... gotta do it the hard way
    return _slow_acquire(chip_size, chip_count, block_size);
}


block_list::block_list(pool_of_blocks* pool, size_t chip_size, size_t chip_count, size_t block_size)
    : _fake_block(chip_size, chip_count, block_size)
    , _tail(&_fake_block)
    , _pool(pool)
    , _hit_count(0)
    , _avg_hit_rate(0)
{
    /* Make the fake block advertise that it has nothing to give.

       The first time the user tries to /acquire/, the fast case will
       detect that the fake block is "full" and fall back to the slow
       case. The slow case knows about the fake block and will remove
       it from the list.

       This trick lets us minimize the number of branches required by
       the fast-path acquire.
     */
    _fake_block._bits.fake_full();
}


chip_t block_list::_slow_acquire(size_t chip_size,
                                 size_t chip_count, size_t block_size)
{
    // No hit from the last block in the list, so we must rearrange
    // the list or add blocks if necessary.
    _change_blocks(chip_size, chip_count, block_size);
    // try again
    return acquire(chip_size, chip_count, block_size);
}

block_of_chips* block_list::acquire_block(size_t block_size)
{
    union { block_of_chips* b; uintptr_t n; } u = {_pool->acquire_block(this)};
    (void) block_size; // keep gcc happy
    assert((u.n & -block_size) == u.n);
    // Note that the block might already have (still-)allocated
    // chips, so we do not initialize the block.
    return u.b;
}

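/* How a list bootstraps itself (an illustrative walk-through of the code
   above): the constructor installs _fake_block, marked full, as the tail.
   The first acquire() therefore misses, falls into _slow_acquire() and
   then _change_blocks(), which swaps the fake block for a real one taken
   from the pool and links it into a one-element ring. After that, any
   acquire() that finds a free chip in the tail block returns it without
   touching the pool at all.
*/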
/* Rearrange the list. This is called after we failed to
 * acquire a chip from the last block in the list (the tail).
 * We don't know anything about the other blocks in the list yet.
 */
void block_list::_change_blocks(size_t chip_size,
                                size_t chip_count, size_t block_size)
{
    (void) chip_size; // keep gcc happy

    // first time through?
    if(_tail == &_fake_block) {
        // remove the fake block from the list
        _tail = acquire_block(block_size);
        _tail->_next = _tail;
        return;
    }

    /* Check whether we're chewing through our blocks too fast for the
       current ring size.

       If we consistently recycle blocks while they're still more than
       half full then we need a bigger ring, so old blocks have more
       time to cool off between reuses.

       To respond to the end of a spike gracefully, we're going to be
       pretty aggressive about returning blocks to the global pool:
       when recycling blocks we check for runs of blocks which do not
       meet some minimum utilization threshold, discarding all but one
       of them (we keep the last one for buffering purposes).
     */

    // decay_rate is used to compute the average hit rate;
    // consider (roughly) the last 5 blocks
    static double const decay_rate = 1./5;
    _avg_hit_rate = _hit_count*(1-decay_rate) + _avg_hit_rate*decay_rate;

    // min_allocated is where we would like to see the hit counts
    // settle. If we find we are failing to acquire while the hit counts
    // are below this, we must increase the number of blocks in the list
    // (the ring size).
    size_t const min_allocated = (chip_count+1)/2; // 50%

    // max_available is the usable-chip count above which a block counts
    // as nearly empty, making it a candidate for being returned to the
    // pool. It is chip_count minus roughly 10% of chip_count (at least 1):
    // about .9*chip_count, and never more than chip_count-1.
    size_t const max_available = chip_count - std::max((int)(.1*chip_count), 1);

    if(_hit_count < min_allocated && _avg_hit_rate < min_allocated) {
        // too fast... grow the ring
        block_of_chips* new_block = acquire_block(block_size);
        new_block->_next = _tail->_next;
        _tail = _tail->_next = new_block;
    }
    else {
        // compress the run, if any
        block_of_chips* prev = 0;
        block_of_chips* cur = _tail;
        block_of_chips* next;
        while(1) {
            next = cur->_next;
            /* make all zombies in the block usable */
            next->recycle();

            /* Now see how many of the chips are still in use. If too
             * few are available, move the tail, set the hit count to 0,
             * and return, so that the client ends up calling us again
             * and acquiring another block to become the new tail.
             */
            if(next->_bits.usable_count() <= max_available) {
                // Move the tail to avoid certain perverse behavior when
                // the usable count is 0 but chips have been freed at the
                // front of the list. We don't want to forever allocate
                // new blocks just because there's one fully-allocated
                // block in the list. By moving the tail, it slowly
                // circulates through the list and our first inspection
                // will be of a *different* block after the newly
                // allocated block is consumed.
                cur = next;
                break;
            }

            // This block has plenty of usable chips. Enough, in fact,
            // that it's worth releasing it to the underlying pool.
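            // Reaching this point with prev set means the previous
            // iteration found cur nearly empty as well, so cur and next
            // form a run of under-utilized blocks: unlink cur and hand
            // it back to the pool, keeping next (the last block of the
            // run seen so far) as the survivor.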
            if(prev) {
                assert(prev != cur);
                // assert(cur->_bits.usable_count() > max_available);
                assert(next->_bits.usable_count() > max_available);
                prev->_next = next;
                _pool->release_block(cur);
                cur = prev; // reset
            }

            // avoid the endless loop
            if(next == _tail) break;

            prev = cur;
            cur = next;
        } // while

        // Recycle, repair the ring, and advance.
        // NB: if we broke out of the while loop on the first try,
        // we will not have moved the tail at all.
        _tail = cur;
    }

    // _hit_count tracks fast acquires (hits) since the last
    // _change_blocks; start a fresh count.
    _hit_count = 0;
}

/* Recycle all blocks in the list. This is used to force
 * recycling of the heap after a spike of releases,
 * without waiting for the next acquire.
 */
void block_list::recycle(size_t chip_size,
                         size_t chip_count, size_t block_size)
{
    // Artificially set _hit_count so that when we call _change_blocks
    // we can't possibly add a block to the list, and we try to
    // compress instead.
    _hit_count = chip_count;
    _change_blocks(chip_size, chip_count, block_size);
}

block_list::~block_list() {

    // don't free the fake block if we went unused!
    if(_tail == &_fake_block) return;

    // break the cycle so the loop terminates
    block_of_chips* cur = _tail->_next;
    _tail->_next = 0;

    // release blocks until we hit the NULL
    while( (cur = _pool->release_block(cur)) ) {
    }
}

/**\endcond skip */
} // namespace memory_block
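
/* Illustrative usage sketch (comment only, never compiled; my_pool and
   the three size parameters are placeholders standing in for whatever
   pool_of_blocks implementation and block layout the caller really uses):

       block_list list(my_pool, chip_size, chip_count, block_size);

       // fast path: grab a chip from the tail block
       chip_t c = list.acquire(chip_size, chip_count, block_size);

       // a chip goes back through block_of_chips::release_chip(), which
       // locates the owning block from the chip's address

       // after a burst of releases, compress the ring without waiting
       // for the next acquire
       list.recycle(chip_size, chip_count, block_size);
*/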