mem_block.cpp

/* -*- mode:C++; c-basic-offset:4 -*-
     Shore-MT -- Multi-threaded port of the SHORE storage manager

                       Copyright (c) 2007-2009
      Data Intensive Applications and Systems Laboratory (DIAS)
               Ecole Polytechnique Federale de Lausanne

                         All Rights Reserved.

   Permission to use, copy, modify and distribute this software and
   its documentation is hereby granted, provided that both the
   copyright notice and this permission notice appear in all copies of
   the software, derivative works or modified versions, and any
   portions thereof, and that both notices appear in supporting
   documentation.

   This code is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
   DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
   RESULTING FROM THE USE OF THIS SOFTWARE.
*/
/*<std-header orig-src='shore' incl-file-exclusion='MEM_BLOCK_CPP'>

 $Id: mem_block.cpp,v 1.8 2010/12/08 17:37:37 nhall Exp $

SHORE -- Scalable Heterogeneous Object REpository

Copyright (c) 1994-99 Computer Sciences Department, University of
                      Wisconsin -- Madison
All Rights Reserved.

Permission to use, copy, modify and distribute this software and its
documentation is hereby granted, provided that both the copyright
notice and this permission notice appear in all copies of the
software, derivative works or modified versions, and any portions
thereof, and that both notices appear in supporting documentation.

THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
"AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

This software was developed with support by the Advanced Research
Project Agency, ARPA order number 018 (formerly 8230), monitored by
the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
Further funding for this work was provided by DARPA through
Rome Research Laboratory Contract No. F30602-97-2-0247.

*/

/**\cond skip */

#include "w.h"
#include "mem_block.h"
#include "atomic_ops.h"
#include <cstdlib>
#include <stdio.h>
#include <algorithm>
#ifdef __linux
#include <malloc.h>
#endif

// #include <cassert>
#undef assert
void assert_failed(const char *desc, const char *f, int l) {
    fprintf(stdout, "Assertion failed: %s at line %d file %s\n", desc, l, f);
    w_assert0(0);
}
// wrapped in do/while(0) so the macro expands safely inside if/else chains
#define assert(x)   do { if (!(x)) assert_failed(#x, __FILE__, __LINE__); } while (0)

#define TEMPLATE_ARGS chip_size, chip_count, block_size

#ifdef __GNUC__
#if defined(__x86_64) || defined(i386) || defined(__i386__)
#define NEED_POPC64 0
#elif defined(__sparcv9)
#define NEED_POPC64 0
#else
#define NEED_POPC64 1
#endif
#else // !defined(__GNUC__)
#define NEED_POPC64 1
#endif

namespace memory_block {
#if 0 /* keep emacs happy */
} // keep emacs happy...
#endif

#if NEED_POPC64==1
// adapted from http://chessprogramming.wikispaces.com/Population+Count#SWAR-Popcount
typedef unsigned long long u64;
static inline
long popc64(u64 x) {
    u64 k1 = 0x5555555555555555ull;
    u64 k2 = 0x3333333333333333ull;
    u64 k4 = 0x0f0f0f0f0f0f0f0full;
    u64 kf = 0x0101010101010101ull;
    x =  x       - ((x >> 1)  & k1); // put count of each 2 bits into those 2 bits
    x = (x & k2) + ((x >> 2)  & k2); // put count of each 4 bits into those 4 bits
    x = (x       +  (x >> 4)) & k4 ; // put count of each 8 bits into those 8 bits
    x = (x * kf) >> 56; // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ...
    return x;
}
#endif

size_t        block_bits::_popc(bitmap bm) {
#if NEED_POPC64==1
#warning "using home-brew popc routine"
    return popc64(bm);
#elif defined(__x86_64) || defined(i386) || defined(__i386__)
// #warning "Using __builtin_popcountll"
    return __builtin_popcountll(bm);
#elif defined(__sparcv9)
#warning "Using gcc inline asm to access sparcv9 'popc' instruction"
    long rval;
    __asm__("popc    %[in], %[out]" : [out] "=r"(rval) : [in] "r"(bm));
    return rval;
#else
#error "NEED_POPC64: Platform not supported"
#endif
}

block_bits::block_bits(size_t chip_count)
    : _usable_chips(create_mask(chip_count))
    , _zombie_chips(0)
{
    assert(chip_count <= 8*sizeof(bitmap));
}

size_t block_bits::acquire_contiguous(size_t chip_count) {
    (void) chip_count; // make gcc happy...

    /* find the rightmost set bit.

       If the map is smaller than the word size, but logically full, we
       will compute an index equal to the capacity. If the map is
       physically full (_usable_chips == 0) then we'll still compute an
       index equal to capacity because 0-1 will underflow to all set
       bits. Therefore the check for full is the same whether the
       bitmap can use all its bits or not.
     */
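    /* one_bit = x & -x isolates the least-significant set bit of x;
       _popc(one_bit - 1) counts the zero bits below it, which is exactly its
       index. When no bit is set, one_bit is 0 and one_bit - 1 has every bit
       set, so the computed index equals 8*sizeof(bitmap). */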
    bitmap one_bit = _usable_chips & -_usable_chips;
    size_t index = _popc(one_bit-1);
    if(index < 8*sizeof(bitmap)) {
        // common case: we have space
        assert(index < chip_count);
        _usable_chips ^= one_bit;
    }
    else {
        // oops... full
        assert(index == 8*sizeof(bitmap));
    }

    return index;
}

void block_bits::release_contiguous(size_t index, size_t chip_count) {
    // assign this chip to the zombie set for later recycling
    (void) chip_count; // keep gcc happy
    assert(index < chip_count);
    bitmap to_free = bitmap(1) << index;
    assert(! (to_free & *usable_chips()));
    membar_exit();
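    /* membar_exit() acts as a release barrier here: stores to the chip's memory
       become visible before its bit is published. The CAS loop below atomically
       ORs the bit into the zombie set, retrying if another releasing thread
       updated the set concurrently. */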
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov | to_free;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
    bitmap was_free = ov;
    (void) was_free; // keep gcc happy

    assert( ! (was_free & to_free));
}

block_bits::bitmap block_bits::create_mask(size_t bits_set) {
    // doing it this way allows us to create a bitmap of all ones if need be
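    // e.g. create_mask(3) == 0x7, and create_mask(64) shifts by zero, yielding all
    // ones. Callers are expected to pass at least 1, since shifting by the full
    // word width would be undefined.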
    return ~bitmap(0) >> (8*sizeof(bitmap) - bits_set);
}

void block_bits::recycle() {
    /* recycle the block.

       Whatever bits have gone zombie since we last recycled become
       the new set of usable bits. We also XOR them atomically back
       into the zombie set to clear them out there. That way we don't
       leak bits if a releasing thread races us and adds more bits to the
       zombie set after we read it.
    */
    bitmap newly_usable = *&_zombie_chips;
    _usable_chips |= newly_usable;
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov ^ newly_usable;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
}

void* block::acquire_chip(size_t chip_size, size_t chip_count, size_t /*block_size*/) {
    size_t index = _bits.acquire_contiguous(chip_count);
    return (index < chip_count)? _get(index, chip_size) : 0;
}

void block::release_chip(void* ptr, size_t chip_size, size_t chip_count, size_t block_size)
{
    /* use pointer arithmetic to find the beginning of our block,
       where the block* lives.

       Our caller is responsible for ensuring that this address
       actually falls inside a memory block.
     */
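    /* blocks are block_size-aligned, so masking the address with -block_size
       (valid because block_size is a power of two) clears the low bits and
       rounds ptr down to the owning block's header. */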
    union { void* v; size_t n; block* b; char* c; } u = {ptr}, v=u;
    u.n &= -block_size;
    size_t offset = v.c - u.b->_data;
    size_t idx = offset/chip_size;
    assert(u.b->_data + idx*chip_size == ptr);
    u.b->_bits.release_contiguous(idx, chip_count);
}

char* block::_get(size_t index, size_t chip_size) {
    return _data + index*chip_size;
}

block::block(size_t chip_size, size_t chip_count, size_t block_size)
    : _bits(chip_count)
    , _owner(0)
    , _next(0)
{
    // make sure all the chips actually fit in this block
    char* end_of_block = _get(0, chip_size)-sizeof(this)+block_size;
    char* end_of_chips = _get(chip_count, chip_size);
    (void) end_of_block; // keep gcc happy
    (void) end_of_chips; // keep gcc happy
    assert(end_of_chips <= end_of_block);

    /* We purposefully don't check alignment here because some parts
       of the impl cheat for blocks which will never be used to
       allocate anything (the fake_block being the main culprit).
       The block_pool does check alignment, though.
     */
}

void* block_list::acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
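    // fast path: the current tail block usually still has a free chip; on a
    // miss we fall back to _slow_acquire, which rotates or grows the block ring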
    if(void* ptr = _tail->acquire_chip(TEMPLATE_ARGS)) {
        _hit_count++;
        return ptr;
    }

    // darn... gotta do it the hard way
    return _slow_acquire(TEMPLATE_ARGS);
}


block_list::block_list(block_pool* pool, size_t chip_size, size_t chip_count, size_t block_size)
    : _fake_block(TEMPLATE_ARGS)
    , _tail(&_fake_block)
    , _pool(pool)
    , _hit_count(0)
    , _avg_hit_rate(0)
{
    /* make the fake block advertise that it has nothing to give.

       The first time the user tries to /acquire/, the fast case will
       detect that the fake block is "full" and fall back to the slow
       case. The slow case knows about the fake block and will remove
       it from the list.

       This trick lets us minimize the number of branches required by
       the fast path acquire.
     */
    _fake_block._bits._usable_chips = 0;
    _fake_block._bits._zombie_chips = 0;
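    // with no usable chips advertised, acquire_contiguous() reports "full", so the
    // first acquire() immediately takes the _slow_acquire path and installs a real block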
}


void* block_list::_slow_acquire(size_t chip_size,
        size_t chip_count, size_t block_size)
{
    _change_blocks(TEMPLATE_ARGS);
    return acquire(TEMPLATE_ARGS);
}

block* block_list::acquire_block(size_t block_size)
{
    union { block* b; uintptr_t n; } u = {_pool->acquire_block(this)};
    (void) block_size; // keep gcc happy
    assert((u.n & -block_size) == u.n);
    return u.b;
}

void block_list::_change_blocks(size_t chip_size,
        size_t chip_count, size_t block_size)
{
    (void) chip_size; // keep gcc happy

    // first time through?
    if(_tail == &_fake_block) {
        _tail = acquire_block(block_size);
        _tail->_next = _tail;
        return;
    }

    /* Check whether we're chewing through our blocks too fast for the
       current ring size.

       If we consistently recycle blocks while they're still more than
       half full then we need a bigger ring so old blocks have more
       time to cool off between reuses.

       To respond to end-of-spike gracefully, we're going to be pretty
       aggressive about returning blocks to the global pool: when
       recycling blocks we check for runs of blocks which do not meet
       some minimum utilization threshold, discarding all but one of
       them (keep the last for buffering purposes).
    */
    static double const    decay_rate = 1./5; // consider (roughly) the last 5 blocks
    // a block with nearly all of its chips still free suggests we should unload some extra blocks
    size_t const max_available = chip_count - std::max((int)(.1*chip_count), 1);
    // more than 50% in-use suggests we've got too few blocks
    size_t const min_allocated = (chip_count+1)/2;
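    // e.g. with chip_count == 64: max_available == 58 and min_allocated == 32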

    _avg_hit_rate = _hit_count*(1-decay_rate) + _avg_hit_rate*decay_rate;
    if(_hit_count < min_allocated && _avg_hit_rate < min_allocated) {
        // too fast.. grow the ring
        block* new_block = acquire_block(block_size);
        new_block->_next = _tail->_next;
        _tail = _tail->_next = new_block;
    }
    else {
        // compress the run, if any
        block* prev = 0;
        block* cur = _tail;
        block* next;
        while(1) {
            next = cur->_next;
            next->recycle();
            if(next->_bits.usable_count() <= max_available)
                break;

            // compression possible?
            if(prev) {
                assert(prev != cur);
                assert(cur->_bits.usable_count() > max_available);
                assert(next->_bits.usable_count() > max_available);
                prev->_next = next;
                _pool->release_block(cur);
                cur = prev; // reset
            }

            // avoid the endless loop
            if(next == _tail) break;

            prev = cur;
            cur = next;
        }

        // recycle, repair the ring, and advance
        _tail = cur;
    }

    _hit_count = 0;
}

block_list::~block_list() {
    // don't free the fake block if we went unused!
    if(_tail == &_fake_block) return;

    // break the cycle so the loop terminates
    block* cur = _tail->_next;
    _tail->_next = 0;

    // release blocks until we hit the NULL
    while( (cur=_pool->release_block(cur)) ) ;
}

/**\endcond skip */
} // namespace memory_block
