io.cpp

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore'>
00025 
00026  $Id: io.cpp,v 1.43 2010/08/03 14:24:52 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #include "w_defines.h"
00054 
00055 /*  -- do not edit anything above this line --   </std-header>*/
00056 
00057 
00058 /*
00059  *   NewThreads is Copyright 1992, 1993, 1994, 1995, 1996, 1997 by:
00060  *
00061  *    Josef Burger    <bolo@cs.wisc.edu>
00062  *    Dylan McNamee    <dylan@cse.ogi.edu>
00063  *      Ed Felten       <felten@cs.princeton.edu>
00064  *
00065  *   All Rights Reserved.
00066  *
00067  *   NewThreads may be freely used as long as credit is given
00068  *   to the above authors and the above copyright is maintained.
00069  */
00070 
00071 /**\cond skip */
00072 #define    IO_C
00073 
00074 #include <w.h>
00075 #include <w_debug.h>
00076 #include <w_stream.h>
00077 #include <cstdlib>
00078 #include <cstring>
00079 #include <sys/time.h>
00080 #include <sys/wait.h>
00081 #include <new>
00082 #include <sys/stat.h>
00083 #include <sys/mman.h>
00084 #include <w_rusage.h>
00085 #if defined(HAVE_HUGETLBFS)
00086 #include <fcntl.h>
00087 #endif
00088 #include "sthread.h"
00089 #include "sthread_stats.h"
00090 #include <sdisk.h>
00091 #include <sdisk_unix.h>
00092 
00093 #if defined(HUGEPAGESIZE) && (HUGEPAGESIZE == 0)
00094 #undef HUGEPAGESIZE
00095 #endif
00096 
00097 extern class sthread_stats SthreadStats;
00098 
00099 sdisk_t         **sthread_t::_disks = 0;
00100 unsigned        sthread_t::open_max = 0;
00101 unsigned        sthread_t::open_count = 0;
00102 
00103 static          queue_based_lock_t    protectFDs;
00104 
00105 int       sthread_t:: _disk_buffer_disalignment(0); 
00106 size_t    sthread_t:: _disk_buffer_size(0); 
00107 char *    sthread_t:: _disk_buffer (NULL);
00108 
00109 int sthread_t::do_unmap()
00110 {
00111     // munmap isn't strictly necessary since this will
00112     // cause the sm to croak and the mapping will be done at
00113     // process-end
00114 
00115 #ifdef WITHOUT_MMAP
00116     ::free( _disk_buffer - _disk_buffer_disalignment ); 
00117     _disk_buffer = NULL;
00118     _disk_buffer_disalignment = 0;
00119     return 0;
00120 #endif
00121 
00122 #if 0
00123     fprintf(stderr, "%d: munmap disalignment %d addr %p, size %lu\n",
00124             __LINE__, 
00125             _disk_buffer_disalignment,  
00126             ( _disk_buffer -  _disk_buffer_disalignment),  
00127             _disk_buffer_size);
00128 #endif
00129 
00130     int err =
00131         munmap( _disk_buffer -  _disk_buffer_disalignment,  _disk_buffer_size);
00132 
00133     if(err) {
00134         cerr << "munmap returns " << err 
00135             << " errno is " <<  errno  << " " << strerror(errno)
00136             << endl;
00137         w_assert1(!err);
00138     }
00139 
00140      _disk_buffer = NULL;
00141      _disk_buffer_size = 0;
00142      _disk_buffer_disalignment = 0;
00143 
00144     return err;
00145 }
00146 
00147 void sthread_t::align_for_sm(size_t W_IFDEBUG1(requested_size))
00148 {
00149     char * _disk_buffer2  = (char *)alignon( _disk_buffer, SM_PAGESIZE);
00150     if( _disk_buffer2 !=  _disk_buffer) 
00151     {
00152         // We made the size big enough that we can align it here
00153         _disk_buffer_disalignment = ( _disk_buffer2 -  _disk_buffer);
00154         w_assert1( _disk_buffer_disalignment < SM_PAGESIZE);
00155         w_assert1( _disk_buffer_size -  _disk_buffer_disalignment 
00156             >= requested_size);
00157 
00158          _disk_buffer =  _disk_buffer2;
00159 
00160     }
00161 }
00162 
00163 long sthread_t::get_max_page_size(long system_page_size)
00164 {
00165     long max_page_size = 0;
00166 #ifdef HAVE_GETPAGESIZES
00167     {
00168         int nelem = getpagesizes(NULL, 0);
00169         if(nelem >= 0) {
00170             size_t *pagesize = new size_t[nelem];
00171             int err = getpagesizes (pagesize, nelem);
00172             if(err >= 0) {
00173                 for(int i=0; i < nelem; i++) {
00174                    if ( pagesize[i] > max_page_size) { 
00175                        max_page_size = pagesize[i];
00176            }
00177                 }
00178             } else {
00179            cerr << "getpagesizes(pagesize, " << nelem << ") failed. "
00180             << " errno is " <<  errno  << " " << strerror(errno)
00181             << endl;
00182             }
00183             delete[] pagesize;
00184         } else {
00185            cerr << "getpagesizes(NULL,0) failed. "
00186             << " errno is " <<  errno  << " " << strerror(errno)
00187            << endl;
00188         }
00189     }
00190 #else
00191     max_page_size = system_page_size;
00192 #endif
00193     /*
00194     cerr << "Max    page size is " << max_page_size
00195         << "( " << int(max_page_size/1024) << " KB) " << endl;
00196     cerr << "System page size is " << system_page_size 
00197         << "( " << int(system_page_size/1024) << " KB) " << endl;
00198     */
00199     return max_page_size;
00200 }
00201 
00202 void sthread_t::align_bufsize(size_t size, long system_page_size,
00203                                                 long max_page_size)
00204 {
00205     // ***********************************************************
00206     //
00207     //  PROPERLY ALIGN ARGUMENTS TO MMAP
00208     //
00209     // The max page size should be a multiple of the system page size -
00210     // that should be a given.
00211 
00212     w_assert0(alignon(max_page_size, system_page_size) == max_page_size);
00213     //
00214     // The size requested must be multiples of
00215     // the page size to be used as well as of the system page size,
00216     // and while it doesn't have to be a multiple of the SM page
00217     // size, it must at least accommodate the size requested, which
00218     // is a multiple of the SM page size.
00219     // ***********************************************************
00220     _disk_buffer_size  = alignon(size, max_page_size);
00221     w_assert1(_disk_buffer_size >= size); // goes without saying
00222 
00223     // should now be aligned on both page sizes
00224     w_assert1(size_t(alignon(_disk_buffer_size, max_page_size)) 
00225         == _disk_buffer_size);
00226     w_assert1(size_t(alignon(_disk_buffer_size, system_page_size)) 
00227         == _disk_buffer_size);
00228 }
00229 
/*
 *  clear(buf_start, requested_size)
 *
 *  Zero the first requested_size bytes of buf_start.
 *
 *  When built for hugetlbfs (HAVE_HUGETLBFS and HUGEPAGESIZE > 0) every
 *  byte of each whole SM page is read before the buffer is overwritten.
 *  The reads go through a volatile pointer so that the compiler cannot
 *  discard them.
 */
void clear(char *buf_start, size_t requested_size)
{
#if defined(HAVE_HUGETLBFS) && defined(HUGEPAGESIZE) && (HUGEPAGESIZE > 0)
    // Try reading first: do it in pages.  Only whole SM pages are read.
    const size_t bytes_to_read =
        (requested_size / SM_PAGESIZE) * SM_PAGESIZE;
    const volatile char *probe = buf_start;
    for(size_t offset=0; offset < bytes_to_read; offset++) {
        char sink = probe[offset];
        (void) sink;
    }

#if W_DEBUG_LEVEL > 4
    fprintf(stderr, "clearing %lu bytes starting at %p\n", 
            (unsigned long) requested_size, (void *) buf_start); 
#endif
#endif
    memset(buf_start, 0, requested_size);
}
00262 
00263 
00264 w_rc_t sthread_t::set_bufsize_normal(
00265     size_t size, char *&buf_start /* in/out*/, long system_page_size)
00266 {
00267     size_t requested_size = size; // save for asserts later
00268 
00269     // ***********************************************************
00270     //
00271     //  GET PAGE SIZES
00272     //
00273     // ***********************************************************
00274     long max_page_size = get_max_page_size(system_page_size);
00275     w_assert1(system_page_size <= max_page_size); 
00276 
00277     // ***********************************************************
00278     //
00279     //  GET FILE DESCRIPTOR FOR MMAP
00280     //
00281     // ***********************************************************
00282     int fd(-1); // must be -1 if not mapping to a file
00283 
00284     // ***********************************************************
00285     //
00286     //  GET FLAGS FOR MMAP
00287     //
00288     // If posix mmapped file are available, _POSIX_MAPPED_FILES is defined
00289     // in <unistd.h> to be > 0
00290     //
00291     // That should give you these flags:
00292     // MAP_FIXED, MAP_PRIVATE, MAP_NORESERVE, MAP_ANONYMOUS
00293     // If MAP_ANONYMOUS is not there, MAP_ANON might be.
00294     //
00295     // However... systems aren't exactly in sync here, so configure.ac
00296     // checks for each of these flags.
00297     //
00298     // ***********************************************************
00299     int flags1 = MAP_PRIVATE;
00300     int flags2 = MAP_PRIVATE;
00301 
00302 #if HAVE_DECL_MAP_ANONYMOUS==1
00303     flags1  |= MAP_ANONYMOUS;
00304     flags2  |= MAP_ANONYMOUS;
00305 #elif HAVE_DECL_MAP_ANON==1
00306     flags1  |= MAP_ANON;
00307     flags2  |= MAP_ANON;
00308 #else
00309 #endif
00310 
00311 #if HAVE_DECL_MAP_NORESERVE==1
00312     flags1  |= MAP_NORESERVE;
00313 #endif
00314 #if HAVE_DECL_MAP_FIXED==1
00315     flags2  |= MAP_FIXED;
00316 #endif
00317 
00318 #if HAVE_DECL_MAP_ALIGN==1
00319     flags1 |= MAP_ALIGN;
00320 #endif
00321     // add one SM_PAGESIZE to the size requested before alignment,
00322     // and then do our own alignment at the end
00323     // In the case of MAP_ALIGN this shouldn't be necessary, but
00324     // we have so many different cases, it's going to be unreadable
00325     // if we try to avoid this in the one case, so do it in every case.
00326     size += SM_PAGESIZE;
00327     align_bufsize(size, system_page_size, max_page_size);
00328 
00329     // ***********************************************************
00330     //
00331     // FIRST MMAP: get a mapped region from the kernel.
00332     // If we are using hugetlbfs, fd will be >= 0 and
00333     // we won't have to do the remap -- the first mapping will
00334     // give us the best page sizes we can get.  In that case,
00335     // skip the first mmap and do exactly one "second mmap"
00336     //
00337     // ***********************************************************
00338 
00339     errno = 0;
00340     _disk_buffer = (char*) mmap(0, _disk_buffer_size,
00341                PROT_NONE,
00342                flags1,
00343                fd,   /* fd */
00344                0     /* off_t */
00345                );
00346 
00347     if (_disk_buffer == MAP_FAILED) {
00348         cerr 
00349             << __LINE__ << " " 
00350             << "mmap (size=" << _disk_buffer_size 
00351             << " = " << int(_disk_buffer_size/1024)
00352             << " KB ) returns " << long(_disk_buffer)
00353             << " errno is " <<  errno  << " " << strerror(errno)
00354             << " flags " <<  flags1  
00355             << " fd " <<  fd  
00356             << endl;
00357         return RC(fcMMAPFAILED);
00358     }
00359 #if W_DEBUG_LEVEL > 4
00360     else
00361     {
00362         cerr 
00363             << __LINE__ << " " 
00364             << "mmap SUCCESS! (size=" << _disk_buffer_size 
00365             << " = " << int(_disk_buffer_size/1024)
00366             << " KB ) returns " << long(_disk_buffer)
00367             << " errno is " <<  errno  << " " << strerror(errno)
00368             << " flags " <<  flags1  
00369             << " fd " <<  fd  
00370             << endl;
00371     }
00372 #endif
00373 
00374 
00375     // ***********************************************************
00376     //
00377     // RE-MMAP: break up the mapped region into max_page_size
00378     // chunks and remap them.
00379     //
00380     // ***********************************************************
00381     int nchunks = _disk_buffer_size / max_page_size;
00382     w_assert1(size_t(nchunks * max_page_size) == _disk_buffer_size);
00383 
00384 
00385     for(int i=0; i < nchunks; i++)
00386     {
00387         char *addr = _disk_buffer + (i * max_page_size); 
00388         char *sub_buffer = (char*) mmap(addr, 
00389                max_page_size,
00390                        PROT_READ | PROT_WRITE, /* prot */
00391                        flags2,
00392                        fd,   /* fd */
00393                        0     /* off_t */
00394                        );
00395 
00396         if (sub_buffer == MAP_FAILED) {
00397             cerr 
00398                 << __LINE__ << " " 
00399                 << "mmap (addr=" << long(addr )
00400                 << ", size=" << max_page_size << ") returns -1;"
00401                 << " errno is " <<  errno  << " " << strerror(errno)
00402                 << " flags " <<  flags2  
00403                 << " fd " <<  fd  
00404                 << endl;
00405             do_unmap();
00406             return RC(fcMMAPFAILED);
00407         }
00408         w_assert1(sub_buffer == addr);
00409 #ifdef HAVE_MEMCNTL
00410         struct memcntl_mha info;
00411         info.mha_cmd = MHA_MAPSIZE_VA;
00412         info.mha_flags = 0;
00413         info.mha_pagesize = max_page_size;
00414         // Ask the kernel to use the max page size here
00415         if(memcntl(sub_buffer, max_page_size, MC_HAT_ADVISE, (char *)&info, 0, 0) < 0)
00416        
00417         {
00418             cerr << "memcntl (chunk " << i << ") returns -1;"
00419                 << " errno is " <<  errno  << " " << strerror(errno)
00420                 << " requested size " <<  max_page_size  << endl;
00421             do_unmap();
00422             return RC(fcMMAPFAILED);
00423         }
00424 #endif
00425     }
00426 
00427     align_for_sm(requested_size);
00428     buf_start = _disk_buffer;
00429     clear(buf_start, requested_size);
00430     return RCOK;
00431 }
00432 
00433 #ifdef WITHOUT_MMAP
00434 w_rc_t 
00435 sthread_t::set_bufsize_memalign(size_t size, char *&buf_start /* in/out*/,
00436     long system_page_size)
00437 {
00438     size_t requested_size = size; // save for asserts later
00439 
00440     // ***********************************************************
00441     //
00442     //  GET PAGE SIZES
00443     //
00444     // ***********************************************************
00445 
00446     long max_page_size = system_page_size;
00447 
00448     align_bufsize(size, system_page_size, max_page_size);
00449 
00450     w_assert1(_disk_buffer == NULL);
00451 
00452 #ifdef HAVE_POSIX_MEMALIGN
00453     void *addr;
00454     int e = posix_memalign(&addr, SM_PAGESIZE, size);
00455     if (e == 0) {
00456         _disk_buffer = (char *)addr;
00457     } else {
00458         _disk_buffer = 0;
00459     }
00460 #elif  HAVE_MEMALIGN
00461     _disk_buffer =  (char *)memalign(SM_PAGESIZE, size);
00462 #elif  HAVE_VALLOC
00463     size += SM_PAGESIZE; // for alignment, add  a page and align it after.
00464     _disk_buffer =  valloc(size);
00465 #else
00466     size += SM_PAGESIZE; // for alignment, add  a page and align it after.
00467     _disk_buffer =  malloc(size);
00468 #endif
00469     if (_disk_buffer == 0) {
00470         cerr 
00471             << __LINE__ << " " 
00472             << "could not allocate memory (alignment=" << SM_PAGESIZE 
00473         << "," << size << ") returns -error;"
00474             << " errno is " << strerror(errno)
00475             << endl;
00476         return RC(fcINTERNAL);
00477     }
00478     align_for_sm(requested_size);
00479     buf_start = _disk_buffer;
00480     clear(buf_start, requested_size);
00481     return RCOK;
00482 }
00483 #endif
00484 
00485 #if defined(HAVE_HUGETLBFS)
00486 
00487 #if HUGEPAGESIZE>0
00488 #else
00489 #   error You have configured to use hugetlbfs but you have no hugepagesize
00490 #   error Look for Hugepagesize in /proc/meminfo
00491 #endif
00492 
00493 static const char *hugefs_path(NULL);
00494 w_rc_t 
00495 sthread_t::set_hugetlbfs_path(const char *what) 
00496 { 
00497     if(strcmp(what, "NULL")==0) {
00498         // Do not use tlbfs
00499         hugefs_path = NULL;
00500         return RCOK;
00501     }
00502 
00503     // stat the path to make sure it at least exists.
00504     // TODO: check the permissions and all that
00505     struct stat statbuf;
00506     int e=stat(what, &statbuf);
00507     if(e) {
00508         fprintf(stderr, "Could not stat \"%s\"\n", what);
00509         int fd = ::open(what, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
00510         if (fd < 0) {
00511             fprintf(stderr, "Could not create \"%s\"\n", what);
00512             return RC(stBADPATH);
00513         } else {
00514             cerr << " created " << what << endl;
00515         }
00516     }
00517     hugefs_path = what; 
00518     // fprintf(stderr, "path is %s\n", hugefs_path);
00519     return RCOK;
00520 }
00521 
00522 w_rc_t 
00523 sthread_t::set_bufsize_huge(
00524     size_t size, 
00525     char *&buf_start /* in/out*/,
00526     long system_page_size)
00527 {
00528     size_t requested_size = size; // save for asserts later
00529 
00530     // ***********************************************************
00531     //
00532     //  GET PAGE SIZES
00533     //
00534     // ***********************************************************
00535 
00536     long max_page_size = 1024 * HUGEPAGESIZE; 
00537     // I don't know how to get this programatically
00538 
00539     w_assert1(system_page_size <= max_page_size); 
00540 
00541     // ***********************************************************
00542     //
00543     //  GET FILE DESCRIPTOR FOR MMAP
00544     //
00545     // ***********************************************************
00546     // TODO: verify that this file can be multiply mapped
00547     // by diff users (i.e., don't need unique file name for each sm)
00548 
00549 
00550     if(hugefs_path == NULL)
00551     {
00552         fprintf(stderr, "path is %s\n", hugefs_path);
00553         fprintf(stderr, 
00554             "Need path to huge fs. Use ::set_hugetlbfs_path(path)\n");
00555         return RC(fcMMAPFAILED);
00556     }
00557     int fd = ::open(hugefs_path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
00558     if (fd < 0) {
00559         cerr << " could not open " << hugefs_path << endl;
00560         return RC(fcMMAPFAILED);
00561     }
00562 
00563     // ***********************************************************
00564     //
00565     //  GET FLAGS FOR MMAP
00566     //
00567     // If posix mmapped file are available, _POSIX_MAPPED_FILES is defined
00568     // in <unistd.h> to be > 0
00569     //
00570     // That should give you these flags:
00571     // MAP_FIXED, MAP_PRIVATE, MAP_NORESERVE, MAP_ANONYMOUS
00572     // If MAP_ANONYMOUS is not there, MAP_ANON might be.
00573     //
00574     // However... systems aren't exactly in sync here, so configure.ac
00575     // checks for each of these flags.
00576     //
00577     // ***********************************************************
00578     int flags = 
00579         MAP_PRIVATE;
00580 
00581     /* NOTE: cannot use ANONYMOUS for hugetlbfs*/
00582 
00583 #if HAVE_DECL_MAP_ALIGN==1
00584     flags |=  MAP_ALIGN;
00585     fprintf(stderr, "%d: adding flag 0x%x %s\n", __LINE__,
00586             MAP_ALIGN, "MAP_ALIGN");
00587 #endif
00588     // add one SM_PAGESIZE to the size requested before alignment,
00589     // and then do our own alignment at the end
00590     // In the case of MAP_ALIGN this shouldn't be necessary, but
00591     // we have so many different cases, it's going to be unreadable
00592     // if we try to avoid this in the one case, so do it in every case.
00593     size += SM_PAGESIZE;
00594 
00595     align_bufsize(size, system_page_size, max_page_size);
00596 
00597     // ***********************************************************
00598     //
00599     // MMAP: get a mapped region from the kernel.
00600     //
00601     // ***********************************************************
00602 
00603     w_assert1(_disk_buffer == NULL);
00604 
00605     errno = 0;
00606     // mmap ( 0, length, protection, flags, fd, 0)
00607     _disk_buffer = (char*) mmap(0, _disk_buffer_size,
00608                (PROT_READ | PROT_WRITE), /* prot */
00609                flags,
00610                fd,   /* fd */
00611                0     /* off_t */
00612                );
00613 
00614     if (_disk_buffer == MAP_FAILED) {
00615         cerr 
00616             << __LINE__ << " " 
00617             << "mmap (size=" << _disk_buffer_size << ") returns "
00618             <<  long(_disk_buffer)
00619             << " errno is " <<  errno  << " " << strerror(errno)
00620             << " prot " <<  (PROT_READ | PROT_WRITE)
00621             << " flags " <<  flags
00622             << " fd " <<  fd  
00623             << endl;
00624         close(fd); 
00625         return RC(fcMMAPFAILED);
00626     }
00627 #if W_DEBUG_LEVEL > 4
00628     else
00629     {
00630         fprintf(stderr, 
00631     "%d mmap SUCCESS! (size= %lu, %lu KB) returns %p errno %d/%s prot 0x%x flags 0x%x fd %d\n",
00632         __LINE__, 
00633         _disk_buffer_size,
00634         _disk_buffer_size/1024,
00635         _disk_buffer, errno, strerror(errno), 
00636         (PROT_READ | PROT_WRITE),
00637         flags, fd);
00638         fprintf(stderr, 
00639     "%d mmap (size= %lu, %lu KB) (requested_size %d, %d KB) buf-requested is %d\n",
00640         __LINE__,
00641         _disk_buffer_size,
00642         _disk_buffer_size/1024,
00643         int(requested_size),
00644         int(requested_size/1024),
00645         int(_disk_buffer_size-requested_size) );
00646 
00647 
00648     }
00649 #endif
00650 
00651     align_for_sm(requested_size);
00652     buf_start = _disk_buffer;
00653     clear(buf_start, requested_size);
00654     return RCOK;
00655 }
00656 #endif
00657 
00658 /********************************************************************
00659 
00660 NOTES: HUGETLBFS: To minimize tlb misses:
00661 
00662 Make sure the region uses the largest page size available so that it
00663 will require the fewest tlb entries possible.
00664 
00665 If we have hugetlbfs, use the given path and get an fd for it. (This requires
00666 a shore config argument -- HUGETLBFS_PATH, set in shore.def).
00667 
00668 If not, do the following:
00669 1) mmap a region with PROT_NONE & MAP_PRIVATE, MAP_NORESERVE
00670     (if present)
00671 2) remap in chunks of max page size.
00672 2-a) If we have memcntl, use it to request the largest page size to be used
00673 2-b) re-map sections using largest page size available
00674     with the protection PROT_READ | PROT_WRITE, 
00675     and flags MAP_FIXED 
00676 
00677 To find out the max page size available:
00678 
00679 If we have a hugetlbfs (as in, on linux 2.6), we are stuck with what
00680 it gives us.
00681 
00682 If not, and we have getpagesizes() use it to
00683 get the largest page size we have on this machine,
00684 else 
00685 use sysconf(_SC_PAGESIZE).
00686 
00687 For whatever page size we come up with, make sure the size we request is 
00688 a multiple of the system pages size and of the page size we are trying
00689 to use and the SM page size.
00690 
00691 The resulting buffer address must be aligned wrt the SM page size as
00692 well as the page size we are trying to use.  To ensure that: 
00693 if the system doesn't give us a MAP_ALIGN option, we'll 
00694 add one SM_PAGESIZE to the requested size and then do the
00695 alignment ourselves at the end.
00696 
00697 To make sure the region we get is contiguous: Assume mmap does it right
00698 on any given call; use MAP_FIXED when remapping.
00699 
00700 Finally, if the user configured with --without-mmap, then bypass all this
00701 and just alloc the memory using posix_memalign, memalign,or valloc.
00702 
00703 ********************************************************************/
00704 
00705 w_rc_t 
00706 sthread_t::set_bufsize(size_t size, char *&buf_start /* in/out*/,
00707     bool 
00708 #if defined(HAVE_HUGETLBFS)
00709     // This argument is used only by the unit tests.
00710     use_normal_if_huge_fails /*=false*/
00711 #endif
00712     )
00713 {
00714     if (_disk_buffer && size == 0) {
00715         do_unmap();
00716         return RCOK;
00717     }
00718 
00719     if (_disk_buffer) {
00720         cerr << "Can't re-allocate disk buffer without disabling"
00721             << endl;
00722         return RC(fcINTERNAL);
00723     }
00724 
00725     buf_start = 0;
00726 
00727     long system_page_size = sysconf(_SC_PAGESIZE);
00728 
00729 #ifdef WITHOUT_MMAP
00730     // If the user configured --without-mmap, then don't even 
00731     // bother with the mmap attempts below.
00732     return set_bufsize_memalign(size, buf_start, system_page_size);
00733 #endif
00734 
00735 #if defined(HAVE_HUGETLBFS)
00736     // Ok, we have to have configured for hugefs AND we have to
00737     // have set a path for it.  If we have no path string,
00738     // we have chosen not to use hugetlbfs.  This is the result
00739     // of setting run-time options sm_hugetlbfs_path to "NULL".
00740     // So if we've set the path to "NULL", we will just use the 
00741     // "normal way".
00742     if(hugefs_path != NULL) {
00743         w_rc_t rc =  set_bufsize_huge(size, buf_start, system_page_size);
00744         if( !rc.is_error() ) {
00745 #if W_DEBUG_LEVEL > 10
00746             cout << "Using hugetlbfs size " << size
00747                 << " system_page_size " << system_page_size
00748                 << " path " << hugefs_path << ". " << endl;
00749 #endif
00750             return rc;
00751         }
00752         if(!use_normal_if_huge_fails)
00753         {
00754             return rc;
00755         }
00756         // else, try the other way
00757         cerr << "Skipping hugetlbfs sue to mmap failure: " << rc << endl;
00758     } else {
00759         cout << "Skipping hugetlbfs based on user option. " << endl;
00760     }
00761 #endif
00762     return set_bufsize_normal(size, buf_start, system_page_size);
00763 }
00764 
00765 
00766 char  *
00767 sthread_t::set_bufsize(size_t size)
00768 {
00769     w_rc_t    e;
00770     char    *start;
00771 
00772     if(size==0) { do_unmap(); return NULL; }
00773 
00774     e = set_bufsize(size, start);
00775 
00776     if (e.is_error()) {
00777         cerr << "Hidden Failure: set_bufsize(" << size << "):"
00778             << endl << e << endl;
00779         return 0;
00780     }
00781 
00782     /* compatability on free */
00783     if (size == 0)
00784         start = 0;
00785 
00786     return start;
00787 }
00788 
00789 
00790 w_rc_t
00791 sthread_t::open(const char* path, int flags, int mode, int &ret_fd)
00792 {
00793     w_rc_t    e;
00794     sdisk_t    *dp;
00795 
00796     /* default return value */
00797     ret_fd = -1;
00798 
00799     bool    open_local = true;
00800 
00801     CRITICAL_SECTION(cs, protectFDs);
00802 
00803     if (open_count >= open_max) {
00804         // This was originally done because when we used a separate
00805         // process for blocking i/o, we could
00806         // have many more open files than sthread_t::open_max.  
00807         // But with threading, we are stuck with the the os limit O(1024). 
00808         // For now, we use the original code because open_max starts out 0.
00809         // TODO : We need to use os limit here, acquire the array once.  
00810         // I suppose it's worth doing this dynamically for several reasons.
00811         // Not all threads do I/O, for one thing.
00812         //
00813         /* reallocate file table */
00814         unsigned    new_max = open_max + 64;
00815         sdisk_t    **new_disks = new sdisk_t *[new_max];
00816         /* XXX could generate chained error or duplicate existing */
00817         if (!new_disks) {
00818             return RC(fcOUTOFMEMORY);
00819         }
00820         unsigned    disk;
00821         for (disk = 0; disk < open_count; disk++)
00822             new_disks[disk] = _disks[disk];
00823         for (; disk < new_max; disk++)
00824             new_disks[disk] = 0;
00825         sdisk_t    **tmp = _disks;
00826         _disks = new_disks;
00827         open_max = new_max;
00828         delete [] tmp;
00829     }
00830 
00831     /* XXX incredibly slow when #fds large */
00832     unsigned    disk;
00833     for (disk = 0; disk < open_max; disk++)
00834         if (!_disks[disk])
00835             break;
00836     if (disk == open_max) {
00837         return RC(stINTERNAL);    /* XXX or toomanyopen */
00838     }
00839 
00840     /* XXX can allow sim. open by locking lower levels, put dummy
00841         pointer in array, unlocking here, opening, etc */
00842 
00843     if (open_local) {
00844         e = sdisk_unix_t::make(path, flags, mode, dp);
00845     }
00846 
00847 
00848     if (e.is_error()) {
00849         return e;
00850     }
00851 
00852     _disks[disk] = dp;
00853     open_count++;
00854 
00855     ret_fd = fd_base + disk;
00856     
00857     return RCOK;
00858 }
00859 
00860 
00861 
00862 /*
00863  *  sthread_t::close(fd)
00864  *
00865  *  Close a file previously opened with sthread_t::open(). 
00866  */
00867 
00868 w_rc_t sthread_t::close(int fd)
00869 {
00870     fd -= fd_base;
00871     if (fd < 0 || fd >= (int)open_max || !_disks[fd])
00872         return RC(stBADFD);
00873 
00874     w_rc_t    e;
00875 
00876     // sync before close
00877     e = _disks[fd]->sync();
00878     if (e.is_error())
00879         return e;
00880 
00881     e = _disks[fd]->close();
00882     if (e.is_error())
00883         return e;
00884 
00885     sdisk_t    *togo;
00886     {
00887         CRITICAL_SECTION(cs, protectFDs);
00888         togo = _disks[fd];
00889         _disks[fd] = 0;
00890         open_count--;
00891     }
00892     delete togo;
00893     
00894     return e;
00895 }
00896 
00897 /*
00898  *  sthread_t::write(fd, buf, n)
00899  *  sthread_t::writev(fd, iov, iovcnt)
00900  *  sthread_t::read(fd, buf, n)
00901  *  sthread_t::readv(fd, iov, iovcnt)
00902  *  sthread_t::fsync(fd)
00903  *  sthread_t::ftruncate(fd, len)
00904  *
00905  *  Perform I/O.
00906  *
00907  *  XXX Currently I/O operations that don't have a complete character
00908  *  count return with a "SHORTIO" error.  In the future,
00909  *  there should be two forms of read and
00910  *  write operations.  The current style which returns errors
00911  *  on "Short I/O", and a new version which can return a character
00912  *  count, or "Short I/O" if a character count can't be
00913  *  determined.
00914  *
00915  *  XXX various un-const casts are included below.  Some of them
00916  *  will be undone when cleanup hits.  Others should be
00917  *  propagated outward to the method declarations in sthread.h
00918  *  to match what the underlying OSs may guarantee.
00919  */
00920 
00921 w_rc_t    sthread_t::read(int fd, void* buf, int n)
00922 {
00923     fd -= fd_base;
00924     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
00925         return RC(stBADFD);
00926 
00927     int    done = 0;
00928     w_rc_t    e;
00929 
00930     e = _disks[fd]->read(buf, n, done);
00931     if (!e.is_error() && done != n)
00932         e = RC(stSHORTIO);
00933 
00934     return e;
00935 }
00936 
00937 
00938 w_rc_t    sthread_t::write(int fd, const void* buf, int n)
00939 {
00940     fd -= fd_base;
00941     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
00942         return RC(stBADFD);
00943 
00944     int    done = 0;
00945     w_rc_t    e;
00946 
00947     e = _disks[fd]->write(buf, n, done);
00948     if (!e.is_error() && done != n)
00949         e = RC(stSHORTIO);
00950 
00951     return e;
00952 }
00953 
00954 
00955 w_rc_t    sthread_t::readv(int fd, const iovec_t *iov, size_t iovcnt)
00956 {
00957     fd -= fd_base;
00958     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
00959         return RC(stBADFD);
00960 
00961     int    done = 0;
00962     int    total = 0;
00963     w_rc_t    e;
00964 
00965     total = sdisk_t::vsize(iov, iovcnt);
00966 
00967     e = _disks[fd]->readv(iov, iovcnt, done);
00968     if (!e.is_error() && done != total)
00969         e = RC(stSHORTIO);
00970 
00971     return e;
00972 }
00973 
00974 
00975 w_rc_t    sthread_t::writev(int fd, const iovec_t *iov, size_t iovcnt)
00976 {
00977     fd -= fd_base;
00978     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
00979         return RC(stBADFD);
00980 
00981     int    done = 0;
00982     int    total = 0;
00983     w_rc_t    e;
00984 
00985     total = sdisk_t::vsize(iov, iovcnt);
00986 
00987     e = _disks[fd]->writev(iov, iovcnt, done);
00988     if (!e.is_error() && done != total)
00989         e = RC(stSHORTIO);
00990 
00991     return e;
00992 }
00993 
00994 
00995 w_rc_t    sthread_t::pread(int fd, void *buf, int n, fileoff_t pos)
00996 {
00997     fd -= fd_base;
00998     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
00999         return RC(stBADFD);
01000 
01001     int    done = 0;
01002     w_rc_t    e;
01003 
01004     errno = 0;
01005     e = _disks[fd]->pread(buf, n, pos, done);
01006     if (!e.is_error() && done != n) {
01007         e = RC2(stSHORTIO, done);
01008     }
01009 
01010     return e;
01011 }
01012 
01013 
01014 w_rc_t    sthread_t::pwrite(int fd, const void *buf, int n, fileoff_t pos)
01015 {
01016     fd -= fd_base;
01017     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
01018         return RC(stBADFD);
01019 
01020     int    done = 0;
01021     w_rc_t    e;
01022 
01023     e = _disks[fd]->pwrite(buf, n, pos, done);
01024     if (!e.is_error() && done != n)
01025         e = RC(stSHORTIO);
01026 
01027     return e;
01028 }
01029 
01030 
01031 w_rc_t    sthread_t::fsync(int fd)
01032 {
01033     fd -= fd_base;
01034     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
01035         return RC(stBADFD);
01036 
01037     w_rc_t        e;
01038     e = _disks[fd]->sync();
01039 
01040     return e;
01041 }
01042 
01043 w_rc_t    sthread_t::ftruncate(int fd, fileoff_t n)
01044 {
01045     fd -= fd_base;
01046     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
01047         return RC(stBADFD);
01048 
01049     w_rc_t        e;
01050     e =  _disks[fd]->truncate(n);
01051 
01052     return e;
01053 }
01054 
01055 
01056 w_rc_t sthread_t::lseek(int fd, fileoff_t pos, int whence, fileoff_t& ret)
01057 {
01058     fd -= fd_base;
01059     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
01060         return RC(stBADFD);
01061 
01062     w_rc_t    e;
01063 
01064     e = _disks[fd]->seek(pos, whence, ret);
01065 
01066     return e;
01067 }
01068 
01069 
01070 w_rc_t sthread_t::lseek(int fd, fileoff_t offset, int whence)
01071 {
01072     fileoff_t    dest;
01073     w_rc_t        e;
01074 
01075     e = sthread_t::lseek(fd, offset, whence, dest);
01076     if (!e.is_error() && whence == SEEK_AT_SET && dest != offset)
01077         e = RC(stSHORTSEEK);
01078 
01079     return e;
01080 }
01081 
01082 
01083 w_rc_t    sthread_t::fstat(int fd, filestat_t &st)
01084 {
01085     fd -= fd_base;
01086     if (fd < 0 || fd >= (int)open_max || !_disks[fd]) 
01087         return RC(stBADFD);
01088 
01089     w_rc_t    e;
01090 
01091     e = _disks[fd]->stat(st);
01092 
01093     return e;
01094 }
01095 
01096 w_rc_t    sthread_t::fisraw(int fd, bool &isRaw)
01097 {
01098     filestat_t    st;
01099 
01100     isRaw = false;        /* default value */
01101 
01102     W_DO(fstat(fd, st));    /* takes care of errors */
01103 
01104     isRaw = st.is_device ;
01105     return RCOK;
01106 }
01107 
01108 
01109 void    sthread_t::dump_io(ostream &s)
01110 {
01111     s << "I/O:";
01112     s << " open_max=" << int(open_max);
01113     s << " open_count=" << open_count;
01114     s << endl;
01115 }
01116 
01117 extern "C" void dump_io() 
01118 {
01119     sthread_t::dump_io(cout);
01120     cout << flush;
01121 }
01122 /**\endcond skip */

Generated on Mon Jan 2 15:13:57 2012 for Shore Storage Manager by  doxygen 1.4.7