00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00039 #ifndef _PCM_INTERNAL_H
00040 #define _PCM_INTERNAL_H
00041
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <mmintrin.h>
#include <list.h>
#include <spinlock.h>
#include "cuckoo_hash/PointerHashInline.h"
00047
00048
00049 #ifdef __cplusplus
00050 extern "C" {
00051 #endif
00052
00053
00063
/*
 * Compile-time feature switches.
 *
 * Both are force-disabled here, so the code they guard further down in this
 * header is compiled out even when the build system defines them.
 */

/* Gates the probabilistic blocking-store delay in PCM_WB_STORE() and
 * PCM_WB_STORE_MASKED(). */
#undef M_PCM_EMULATE_LATENCY_BLOCKING_STORES

/* Gates the rdtscp-based measurement of the clflush cost in PCM_WB_FLUSH(). */
#undef HAS_RDTSCP
00077
/** Number of write-combining buffers modelled per store set; PCM_NT_STORE()
 *  drains the emulated buffers once this many distinct lines are pending. */
#define WRITE_COMBINING_BUFFERS_NUM 8

/**
 * Number of slots in the per-set write-combining hash table.
 *
 * Must stay a power of two: PCM_NT_STORE() reduces indices with
 * `& (WCBUF_HASHTBL_SIZE - 1)`. The expansion is parenthesized so that the
 * macro is safe inside larger expressions (e.g. `x / WCBUF_HASHTBL_SIZE`).
 */
#define WCBUF_HASHTBL_SIZE (WRITE_COMBINING_BUFFERS_NUM * 4)

/** Memory banking factor. Not referenced anywhere else in this header. */
#define MEMORY_BANKING_FACTOR 8
00086
00087
00088
00089
00090
00091
/**
 * Number of equally likely outcomes used for probabilistic decisions.
 *
 * PCM_WB_STORE() reduces a pseudo-random number modulo this value and
 * compares it with pcm_likelihood_store_blockwaits, so a likelihood is
 * expressed as "N out of TOTAL_OUTCOMES_NUM".
 */
#define TOTAL_OUTCOMES_NUM 1000000

/* RAND_MAX comes from <stdlib.h>; if that header is not visible here the
 * identifier evaluates to 0 inside #if and this check fires. */
#if (RAND_MAX < TOTAL_OUTCOMES_NUM)
# error "RAND_MAX must be at least equal to TOTAL_OUTCOMES_NUM."
#endif
00097
00098
/*
 * Time-unit conversions between nanoseconds and CPU cycles.
 *
 * M_PCM_CPUFREQ is supplied by the build; the arithmetic is consistent with
 * it being the CPU frequency in MHz (cycles = ns * MHz / 1000) -- TODO confirm.
 * The frequency is parenthesized so that an expression-valued definition
 * cannot change the precedence of the conversion.
 *
 * The computation is carried out in the type of the argument, so callers
 * should pass a 64-bit value when the product may exceed the range of int.
 */
#define NS2CYCLE(ns_)      ((ns_) * (M_PCM_CPUFREQ) / 1000)
#define CYCLE2NS(cycles_)  ((cycles_) * 1000 / (M_PCM_CPUFREQ))
00101
00102
/*
 * Branch-prediction hints. `!!(x)` normalizes any non-zero value to 1 so the
 * expression can be compared against the expected constant.
 */
#define likely(x)    __builtin_expect(!!(x), 1)
#define unlikely(x)  __builtin_expect(!!(x), 0)
00105
00106
00107
00108
/** Page size in bytes assumed by the helpers below. */
#define PAGE_SIZE 4096

/** Number of pages needed to hold `size` bytes (rounds up; 0 for size 0).
 *  `size` is evaluated twice, so it must be free of side effects. */
#define NUM_PAGES(size) ((((size) % PAGE_SIZE) == 0? 0 : 1) + (size)/PAGE_SIZE)

/** `size` rounded up to a whole number of pages, in bytes. */
#define SIZEOF_PAGES(size) (NUM_PAGES((size)) * PAGE_SIZE)

/** `addr` (as an integer) rounded up to the next page boundary. */
#define PAGE_ALIGN(addr) (NUM_PAGES((addr)) * PAGE_SIZE)
00119
00120
00121
00122
/*
 * Cache-line geometry: 64-byte lines on x86-64, 32-byte lines otherwise.
 * CACHELINE_SIZE_LOG is log2(CACHELINE_SIZE).
 */
#ifdef __x86_64__
# define CACHELINE_SIZE     64
# define CACHELINE_SIZE_LOG 6
#else
# define CACHELINE_SIZE     32
# define CACHELINE_SIZE_LOG 5
#endif

/** Address of the cache line containing `addr` (low bits cleared). */
#define BLOCK_ADDR(addr) ( (pcm_word_t *) (((pcm_word_t) (addr)) & ~(CACHELINE_SIZE - 1)) )

/** Byte offset of `addr` within its cache line. Note that the offset is
 *  returned cast to a pointer type, not as an integer. */
#define INDEX_ADDR(addr) ( (pcm_word_t *) (((pcm_word_t) (addr)) & (CACHELINE_SIZE - 1)) )
00133
00134
00135
00136
/** Machine word: the unit written by the store primitives in this header. */
typedef uintptr_t pcm_word_t;

/** High-resolution time value; holds time-stamp-counter readings as well as
 *  cycle and nanosecond counts derived from them. */
typedef uint64_t pcm_hrtime_t;

/* Forward declarations. */
typedef struct pcm_storeset_s pcm_storeset_t;
typedef struct cacheline_tbl_s cacheline_tbl_t;
00143
00144
00145
00150 struct pcm_storeset_s {
00151 uint32_t id;
00152 uint32_t state;
00153 unsigned int rand_seed;
00154 PointerHash *hashtbl;
00155 uint16_t wcbuf_hashtbl[WCBUF_HASHTBL_SIZE];
00156 uint16_t wcbuf_hashtbl_count;
00157 uint32_t seqstream_len;
00158 cacheline_tbl_t *cacheline_tbl;
00159 struct list_head list;
00160 volatile unsigned int in_crash_emulation_code;
00161 uint64_t seqstream_write_TS_array[8];
00162 int seqstream_write_TS_index;
00163 };
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173 extern unsigned int pcm_likelihood_store_blockwaits;
00174 extern volatile arch_spinlock_t ticket_lock;
00175
00176
00177
00178
00179
00180 int pcm_storeset_create(pcm_storeset_t **setp);
00181 void pcm_storeset_destroy(pcm_storeset_t *set);
00182 pcm_storeset_t* pcm_storeset_get(void);
00183 void pcm_storeset_put(void);
00184 void pcm_wb_store_emulate_crash(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val);
00185 void pcm_wb_flush_emulate_crash(pcm_storeset_t *set, volatile pcm_word_t *addr);
00186 void pcm_nt_store_emulate_crash(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val);
00187 void pcm_nt_flush_emulate_crash(pcm_storeset_t *set);
00188
00189
00190
00191
00192
00193
/**
 * Execute a cpuid instruction and discard its results.
 *
 * Used by PCM_SEQSTREAM_FLUSH() purely as a serialising point, so the asm
 * also acts as a compiler barrier ("memory" clobber). The leaf selected by
 * eax is whatever happens to be in the register.
 *
 * Spelled __asm__ __volatile__ like the rest of the file so that it also
 * builds in strict ISO mode, where the bare `asm` keyword is unavailable.
 */
static inline void asm_cpuid(void)
{
    __asm__ __volatile__ ("cpuid" ::: "rax", "rbx", "rcx", "rdx", "memory");
}
00197
#if defined(__i386__)

/** Read the 64-bit time-stamp counter (32-bit x86 build). */
static inline unsigned long long asm_rdtsc(void)
{
    unsigned long long int x;

    /* 0x0f 0x31 encodes rdtsc; "=A" binds the 64-bit result to edx:eax. */
    __asm__ __volatile__ (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}

/** Read the time-stamp counter with rdtscp; the value rdtscp leaves in ecx
 *  is discarded. */
static inline unsigned long long asm_rdtscp(void)
{
    unsigned hi, lo;

    __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"ecx");
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

#elif defined(__x86_64__)

/** Read the 64-bit time-stamp counter (x86-64 build). */
static inline unsigned long long asm_rdtsc(void)
{
    unsigned hi, lo;

    /* rdtsc returns the low half in eax and the high half in edx. */
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

/** Read the time-stamp counter with rdtscp; the value rdtscp leaves in rcx
 *  is discarded. */
static inline unsigned long long asm_rdtscp(void)
{
    unsigned hi, lo;

    __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"rcx");
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

#else
#error "What architecture is this???"
#endif
00232
00233
00234 static inline void asm_sse_write_block64(volatile pcm_word_t *addr, pcm_word_t *val)
00235 {
00236 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[0]): "r" (val[0]));
00237 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[1]): "r" (val[1]));
00238 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[2]): "r" (val[2]));
00239 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[3]): "r" (val[3]));
00240 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[4]): "r" (val[4]));
00241 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[5]): "r" (val[5]));
00242 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[6]): "r" (val[6]));
00243 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*&addr[7]): "r" (val[7]));
00244 }
00245
00246
00247 static inline void asm_movnti(volatile pcm_word_t *addr, pcm_word_t val)
00248 {
00249 __asm__ __volatile__ ("movnti %1, %0" : "=m"(*addr): "r" (val));
00250 }
00251
00252
00253 static inline void asm_clflush(volatile pcm_word_t *addr)
00254 {
00255 __asm__ __volatile__ ("clflush %0" : : "m"(*addr));
00256 }
00257
00258
/**
 * Full memory fence (mfence).
 *
 * The "memory" clobber makes the statement a compiler barrier as well, so
 * the compiler cannot move loads or stores across the fence.
 */
static inline void asm_mfence(void)
{
    __asm__ __volatile__ ("mfence" ::: "memory");
}
00263
00264
/**
 * Store fence (sfence).
 *
 * The "memory" clobber makes the statement a compiler barrier as well, so
 * the compiler cannot move stores across the fence.
 */
static inline void asm_sfence(void)
{
    __asm__ __volatile__ ("sfence" ::: "memory");
}
00269
00270
/**
 * Advance a linear congruential generator and return its new state.
 *
 * The state is updated as `seed * 196314165 + 907633515` in unsigned
 * arithmetic (wrapping at the width of unsigned int).
 *
 * @param seed generator state; updated in place
 * @return the new state converted to int, so the result can be negative
 */
static inline
int rand_int(unsigned int *seed)
{
    const unsigned int next = (*seed) * 196314165u + 907633515u;

    *seed = next;
    return (int) next;
}
00277
00278
00279 # ifdef _EMULATE_LATENCY_USING_NOPS
00280
00281 static inline void asm_nop10() {
00282 asm volatile("nop");
00283 asm volatile("nop");
00284 asm volatile("nop");
00285 asm volatile("nop");
00286 asm volatile("nop");
00287 asm volatile("nop");
00288 asm volatile("nop");
00289 asm volatile("nop");
00290 asm volatile("nop");
00291 asm volatile("nop");
00292 }
00293
00294 static inline
00295 void
00296 emulate_latency_ns(int ns)
00297 {
00298 int i;
00299 pcm_hrtime_t cycles;
00300 pcm_hrtime_t start;
00301 pcm_hrtime_t stop;
00302
00303 cycles = NS2CYCLE(ns);
00304 for (i=0; i<cycles; i+=5) {
00305 asm_nop10();
00306 }
00307 }
00308
00309 # else
00310
00311 static inline
00312 void
00313 emulate_latency_ns(int ns)
00314 {
00315 pcm_hrtime_t cycles;
00316 pcm_hrtime_t start;
00317 pcm_hrtime_t stop;
00318
00319 start = asm_rdtsc();
00320 cycles = NS2CYCLE(ns);
00321
00322 do {
00323
00324
00325
00326
00327
00328
00329 stop = asm_rdtsc();
00330 } while (stop - start < cycles);
00331 }
00332
00333 # endif
00334
00351 static inline
00352 void
00353 write_aligned_masked(pcm_word_t *addr, pcm_word_t val, pcm_word_t mask)
00354 {
00355 uintptr_t a;
00356 int i;
00357 int trailing_0bytes;
00358 int leading_0bytes;
00359
00360 union convert_u {
00361 pcm_word_t w;
00362 uint8_t b[sizeof(pcm_word_t)];
00363 } valu;
00364
00365
00366 if (mask == ((uint64_t) -1)) {
00367 *addr = val;
00368 } else {
00369 valu.w = val;
00370 a = (uintptr_t) addr;
00371 trailing_0bytes = __builtin_ctzll(mask) >> 3;
00372 leading_0bytes = __builtin_clzll(mask) >> 3;
00373 for (i = trailing_0bytes; i<8-leading_0bytes;i++) {
00374 *((uint8_t *) (a+i)) = valu.b[i];
00375 }
00376 }
00377 }
00378
00379
00380
00381
00382
00383
00384
00393 static inline
00394 void
00395 PCM_WB_STORE(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val)
00396 {
00397
00398 #ifdef M_PCM_EMULATE_CRASH
00399 pcm_wb_store_emulate_crash(set, addr, val);
00400 #endif
00401
00402 *addr = val;
00403
00404 #ifdef M_PCM_EMULATE_LATENCY
00405 # ifdef M_PCM_EMULATE_LATENCY_BLOCKING_STORES
00406 if (pcm_likelihood_store_blockwaits > 0) {
00407 int random_number = rand_int(&set->rand_seed) % TOTAL_OUTCOMES_NUM;
00408 if (random_number < pcm_likelihood_store_blockwaits) {
00409 emulate_latency_ns(M_PCM_LATENCY_WRITE);
00410 }
00411 }
00412 # endif
00413 #endif
00414 }
00415
00416
00417 static inline
00418 void
00419 PCM_WB_STORE_MASKED(pcm_storeset_t *set,
00420 volatile pcm_word_t *addr,
00421 pcm_word_t val,
00422 pcm_word_t mask)
00423 {
00424
00425 #ifdef M_PCM_EMULATE_CRASH
00426 pcm_wb_store_emulate_crash(set, addr, val);
00427 #endif
00428
00429 write_aligned_masked((pcm_word_t *) addr, val, mask);
00430
00431 #ifdef M_PCM_EMULATE_LATENCY
00432 # ifdef M_PCM_EMULATE_LATENCY_BLOCKING_STORES
00433 if (pcm_likelihood_store_blockwaits > 0) {
00434 int random_number = rand_int(&set->rand_seed) % TOTAL_OUTCOMES_NUM;
00435 if (random_number < pcm_likelihood_store_blockwaits) {
00436 emulate_latency_ns(M_PCM_LATENCY_WRITE);
00437 }
00438 }
00439 # endif
00440 #endif
00441 }
00442
00443
00444 static inline
00445 void
00446 PCM_WB_STORE_ALIGNED_MASKED(pcm_storeset_t *set,
00447 volatile pcm_word_t *addr,
00448 pcm_word_t val,
00449 pcm_word_t mask)
00450 {
00451 PCM_WB_STORE_MASKED(set, addr, val, mask);
00452 }
00453
00454
00455 static inline
00456 void
00457 PCM_WB_FENCE(pcm_storeset_t *set)
00458 {
00459 asm_mfence();
00460 }
00461
00462
00463
00464
00465 static inline
00466 void
00467 PCM_WB_FLUSH(pcm_storeset_t *set, volatile pcm_word_t *addr)
00468 {
00469 #ifdef M_PCM_EMULATE_CRASH
00470 pcm_wb_flush_emulate_crash(set, addr);
00471 #endif
00472
00473
00474
00475
00476
00477
00478
00479
00480 #ifdef M_PCM_EMULATE_LATENCY
00481 {
00482 #ifdef HAS_RDTSCP
00483
00484
00485 pcm_hrtime_t start;
00486 pcm_hrtime_t stop;
00487
00488 start = asm_rdtscp();
00489 asm_clflush(addr);
00490 stop = asm_rdtscp();
00491 emulate_latency_ns(M_PCM_LATENCY_WRITE - CYCLE2NS(stop-start));
00492 #else
00493 asm_clflush(addr);
00494 emulate_latency_ns(M_PCM_LATENCY_WRITE);
00495 #endif
00496 asm_mfence();
00497 }
00498
00499 #else
00500 asm_clflush(addr);
00501 asm_mfence();
00502 #endif
00503
00504 }
00505
00506
00507
00508
00509
00510
00511
00512
00513 static inline
00514 void
00515 PCM_NT_STORE(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val)
00516 {
00517 #ifdef M_PCM_EMULATE_CRASH
00518 pcm_nt_store_emulate_crash(set, addr, val);
00519 #endif
00520
00521 asm_movnti(addr, val);
00522
00523 #ifdef M_PCM_EMULATE_LATENCY
00524 uint16_t i;
00525 uint16_t index_addr;
00526 uint16_t index_i;
00527 uintptr_t byte_addr;
00528 uintptr_t block_byte_addr;
00529
00530 byte_addr = (uintptr_t) addr;
00531 block_byte_addr = (uintptr_t) BLOCK_ADDR(byte_addr);
00532 index_addr = (uint16_t) ((block_byte_addr >> CACHELINE_SIZE_LOG) & ((uint16_t) (-1)));
00533
00534 retry:
00535 if (set->wcbuf_hashtbl_count < WRITE_COMBINING_BUFFERS_NUM) {
00536 for (i=0; i<WCBUF_HASHTBL_SIZE; i++) {
00537 index_i = (index_addr + i) & (WCBUF_HASHTBL_SIZE-1);
00538 if (set->wcbuf_hashtbl[index_i] == index_addr) {
00539
00540 break;
00541 } else if (set->wcbuf_hashtbl[index_i] == 0) {
00542 set->wcbuf_hashtbl[index_i] = index_addr;
00543 set->wcbuf_hashtbl_count++;
00544 break;
00545 }
00546 }
00547 } else {
00548 memset(set->wcbuf_hashtbl, 0, WCBUF_HASHTBL_SIZE);
00549 emulate_latency_ns(M_PCM_LATENCY_WRITE * set->wcbuf_hashtbl_count);
00550 set->wcbuf_hashtbl_count = 0;
00551 goto retry;
00552 }
00553
00554 #endif
00555 }
00556
00557
00558 static inline
00559 void
00560 PCM_NT_FLUSH(pcm_storeset_t *set)
00561 {
00562 #ifdef M_PCM_EMULATE_CRASH
00563 pcm_nt_flush_emulate_crash(set);
00564 #endif
00565
00566 asm_sfence();
00567 #ifdef M_PCM_EMULATE_LATENCY
00568 emulate_latency_ns(M_PCM_LATENCY_WRITE * set->wcbuf_hashtbl_count);
00569 memset(set->wcbuf_hashtbl, 0, WCBUF_HASHTBL_SIZE);
00570 set->wcbuf_hashtbl_count = 0;
00571 #endif
00572 }
00573
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585 static inline
00586 void
00587 PCM_SEQSTREAM_INIT(pcm_storeset_t *set)
00588 {
00589 #ifdef M_PCM_EMULATE_CRASH
00590
00591 #endif
00592
00593 #ifdef M_PCM_EMULATE_LATENCY
00594 set->seqstream_len = 0;
00595 set->seqstream_write_TS_index = 0;
00596 #endif
00597 }
00598
00599
00600 static inline
00601 void
00602 PCM_SEQSTREAM_STORE(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val)
00603 {
00604
00605 #ifdef M_PCM_EMULATE_CRASH
00606 #endif
00607
00608 asm_movnti(addr, val);
00609
00610 #ifdef M_PCM_EMULATE_LATENCY
00611 set->seqstream_len = set->seqstream_len + 8;
00612
00613
00614
00615
00616
00617
00618
00619
00620
00621 set->seqstream_write_TS_array[set->seqstream_write_TS_index] = asm_rdtsc();
00622 set->seqstream_write_TS_index |= 1;
00623 #endif
00624 }
00625
00626
00627 static inline
00628 void
00629 PCM_SEQSTREAM_STORE_64B_FIRST_WORD(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val)
00630 {
00631
00632 #ifdef M_PCM_EMULATE_CRASH
00633 #endif
00634
00635 asm_movnti(addr, val);
00636
00637 #ifdef M_PCM_EMULATE_LATENCY
00638 set->seqstream_len = set->seqstream_len + 64;
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648 set->seqstream_write_TS_array[set->seqstream_write_TS_index] = asm_rdtsc();
00649 set->seqstream_write_TS_index |= 1;
00650 #endif
00651 }
00652
00653
00654 static inline
00655 void
00656 PCM_SEQSTREAM_STORE_64B_NEXT_WORD(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t val)
00657 {
00658
00659 #ifdef M_PCM_EMULATE_CRASH
00660 #endif
00661
00662 asm_movnti(addr, val);
00663 }
00664
00665
00666 static inline
00667 void
00668 PCM_SEQSTREAM_STORE_64B(pcm_storeset_t *set, volatile pcm_word_t *addr, pcm_word_t *val)
00669 {
00670
00671 #ifdef M_PCM_EMULATE_CRASH
00672 #endif
00673
00674 asm_sse_write_block64(addr, val);
00675
00676 #ifdef M_PCM_EMULATE_LATENCY
00677 set->seqstream_len = set->seqstream_len + 64;
00678
00679
00680
00681
00682
00683
00684
00685
00686 set->seqstream_write_TS_array[set->seqstream_write_TS_index] = asm_rdtsc();
00687 set->seqstream_write_TS_index |= 1;
00688 #endif
00689 }
/** Peak bandwidth of the host's RAM in MB/s, used by PCM_SEQSTREAM_FLUSH()
 *  as the baseline against which the emulated PCM bandwidth is compared. */
#define RAM_SYSTEM_PEAK_BANDWIDTH_MB 7000
00691
00692
00693 static inline
00694 void
00695 PCM_SEQSTREAM_FLUSH(pcm_storeset_t *set)
00696 {
00697 #ifdef M_PCM_EMULATE_CRASH
00698
00699 #endif
00700
00701 #ifdef M_PCM_EMULATE_LATENCY
00702 int pcm_bandwidth_MB = M_PCM_BANDWIDTH_MB;
00703 int ram_system_peak_bandwidth_MB = RAM_SYSTEM_PEAK_BANDWIDTH_MB;
00704 int size;
00705 pcm_hrtime_t handicap_latency;
00706 pcm_hrtime_t extra_latency;
00707 pcm_hrtime_t elapsed_time_ns;
00708 pcm_hrtime_t elapsed_time_cycles;
00709 pcm_hrtime_t current_TS;
00710
00711 if ((size = set->seqstream_len) > 64) {
00712 current_TS = asm_rdtsc();
00713 elapsed_time_cycles = current_TS - set->seqstream_write_TS_array[0];
00714 elapsed_time_ns = CYCLE2NS(elapsed_time_cycles);
00715
00716 handicap_latency = (int) size * (1-(float) (((float) pcm_bandwidth_MB)/1000)/(((float) ram_system_peak_bandwidth_MB)/1000))/(((float)pcm_bandwidth_MB)/1000);
00717 if (handicap_latency > elapsed_time_ns) {
00718 extra_latency = handicap_latency - elapsed_time_ns;
00719 asm_sfence();
00720 __ticket_spin_lock(&ticket_lock);
00721 emulate_latency_ns(extra_latency);
00722 __ticket_spin_unlock(&ticket_lock);
00723 asm_cpuid();
00724 } else {
00725 asm_sfence();
00726 emulate_latency_ns(M_PCM_LATENCY_WRITE);
00727 }
00728 } else {
00729 asm_sfence();
00730 emulate_latency_ns(M_PCM_LATENCY_WRITE);
00731 }
00732 set->seqstream_write_TS_index = 0;
00733 set->seqstream_len = 0;
00734 #else
00735 asm_sfence();
00736 #endif
00737 }
00738
00739 #ifdef __cplusplus
00740 }
00741 #endif
00742
00743 #endif