00001 #ifndef MP_MODEXP
00002 #define MP_MODEXP
00003
00004 #include <stdint.h>
00005
00006 #include <openssl/bn.h>
00007
00008 #include <cuda_runtime.h>
00009 #include <cutil_inline.h>
00010
/*
 * Fixed capacity limits.
 * NOTE(review): the consumers of these limits are not visible in this header;
 * from the names they presumably bound the number of CUDA streams and the
 * number of operand pairs handled per batch -- confirm against the callers.
 */
#define MAX_STREAMS      16
#define MP_MAX_NUM_PAIRS 1024
00013
/*
 * Word-size configuration, selected by MP_USE_64BIT.
 *
 *   MP_USE_64BIT == 1 : WORD is uint64_t, BITS_PER_WORD is 64
 *   MP_USE_64BIT == 0 : WORD is uint32_t, BITS_PER_WORD is 32
 *
 * S_<bits> is the number of WORDs in a <bits>-bit integer
 * (<bits> / BITS_PER_WORD); MAX_S equals S_2048 in both configurations.
 *
 * MP_MSGS_PER_BLOCK is not a constant: it expands to an expression over an
 * identifier named S, which must be in scope wherever the macro is used.
 *
 * The preprocessor evaluates an undefined identifier as 0 inside #if, so a
 * translation unit that never defines MP_USE_64BIT gets the 32-bit
 * configuration. The #error below is therefore reached only when the macro
 * holds a value other than 0 or 1, and its message states exactly that.
 */
#if MP_USE_64BIT == 1

#define BITS_PER_WORD 64
typedef uint64_t WORD;

/* Word counts for 64-bit words. */
#define MAX_S  32
#define S_256  4
#define S_512  8
#define S_1024 16
#define S_2048 32

#define MP_MSGS_PER_BLOCK (16 / (S / S_256))

#elif MP_USE_64BIT == 0

#define BITS_PER_WORD 32
typedef uint32_t WORD;

/* Word counts for 32-bit words. */
#define MAX_S  64
#define S_256  8
#define S_512  16
#define S_1024 32
#define S_2048 64

#define MP_MSGS_PER_BLOCK (8 / (S / S_256))

#else

#error MP_USE_64BIT must be 0 or 1

#endif
00047
00048
/*
 * Build-time feature switches, all enabled (1) here.
 * NOTE(review): the code that tests these switches is not part of this
 * header. From the names, MP_MODEXP_OFFLOAD_POST presumably moves the CRT
 * post-processing onto the GPU and the MONTMUL_FAST_* pair presumably selects
 * an optimized Montgomery multiplication on each side -- confirm in the
 * implementation files.
 */
#define MP_MODEXP_OFFLOAD_POST 1

#define MONTMUL_FAST_CPU       1
#define MONTMUL_FAST_GPU       1
00054
/*
 * sync_if_needed(): conditional block-wide barrier.
 *
 * When __GPU__ is defined, it expands to a statement that calls
 * __syncthreads() if S > 32; S is taken from the scope where the macro is
 * expanded. When __GPU__ is not defined, it expands to nothing.
 *
 * NOTE(review): the threshold 32 presumably corresponds to the warp size,
 * i.e. no barrier is issued when the cooperating threads fit in one warp.
 * That relies on implicit intra-warp synchrony -- confirm it still holds on
 * the target architecture.
 */
#if defined(__GPU__)
#define sync_if_needed()        \
    do {                        \
        if (S > 32) {           \
            __syncthreads();    \
        }                       \
    } while (0)
#else
#define sync_if_needed()
#endif
00060
00061
00062
00063
/*
 * ADD_CARRY(c, s, x, y)
 *
 * Computes x + y in WORD arithmetic, adds 1 to c when the sum wrapped around
 * (sum < x), then stores the sum into s.
 *
 * Ordering: x and y are read and c is updated before the first
 * sync_if_needed(); s is written between the first and the second one.
 * x is evaluated twice, y once; c and s must be lvalues.
 */
#define ADD_CARRY(c, s, x, y)                   \
    do {                                        \
        WORD mp_add_sum_ = (x) + (y);           \
        (c) += (mp_add_sum_ < (x));             \
        sync_if_needed();                       \
        (s) = mp_add_sum_;                      \
        sync_if_needed();                       \
    } while (0)
00072
00073
/*
 * ADD_CARRY_CLEAR(c, s, x, y)
 *
 * Like ADD_CARRY, but also resets y to 0 after it has been consumed:
 * computes x + y in WORD arithmetic and zeroes y before the first
 * sync_if_needed(); afterwards adds 1 to c when the sum wrapped around
 * (sum < x) and stores the sum into s, followed by a second
 * sync_if_needed().
 *
 * x is evaluated twice (the second time after the first barrier);
 * c, s and y must be lvalues.
 */
#define ADD_CARRY_CLEAR(c, s, x, y)             \
    do {                                        \
        WORD mp_add_sum_ = (x) + (y);           \
        (y) = 0;                                \
        sync_if_needed();                       \
        (c) += (mp_add_sum_ < (x));             \
        (s) = mp_add_sum_;                      \
        sync_if_needed();                       \
    } while (0)
00083
00084
00085
00086
/*
 * SUB_BORROW(b, d, x, y)
 *
 * Computes x - y in WORD arithmetic, adds 1 to b when the subtraction
 * wrapped around (difference > x), then stores the difference into d.
 *
 * Ordering: x and y are read and b is updated before the first
 * sync_if_needed(); d is written between the first and the second one.
 * x is evaluated twice, y once; b and d must be lvalues.
 */
#define SUB_BORROW(b, d, x, y)                  \
    do {                                        \
        WORD mp_sub_diff_ = (x) - (y);          \
        (b) += (mp_sub_diff_ > (x));            \
        sync_if_needed();                       \
        (d) = mp_sub_diff_;                     \
        sync_if_needed();                       \
    } while (0)
00095
00096
/*
 * SUB_BORROW_CLEAR(b, d, x, y)
 *
 * Like SUB_BORROW, but also resets y to 0 after it has been consumed:
 * computes x - y in WORD arithmetic and zeroes y before the first
 * sync_if_needed(); afterwards adds 1 to b when the subtraction wrapped
 * around (difference > x) and stores the difference into d, followed by a
 * second sync_if_needed().
 *
 * x is evaluated twice (the second time after the first barrier);
 * b, d and y must be lvalues.
 */
#define SUB_BORROW_CLEAR(b, d, x, y)            \
    do {                                        \
        WORD mp_sub_diff_ = (x) - (y);          \
        (y) = 0;                                \
        sync_if_needed();                       \
        (b) += (mp_sub_diff_ > (x));            \
        (d) = mp_sub_diff_;                     \
        sync_if_needed();                       \
    } while (0)
00106
/*
 * Sliding-window variant selection. The two switches must add up to exactly
 * 1, otherwise compilation stops with the error below. CLNW is the variant
 * enabled here.
 * NOTE(review): CLNW / VLNW presumably stand for constant-length and
 * variable-length nonzero windows -- confirm in the implementation.
 */
#define MP_USE_CLNW 1
#define MP_USE_VLNW 0

#if (MP_USE_CLNW + MP_USE_VLNW) != 1
#error Use one and only one sliding window technique
#endif
00113
/* Capacity of the per-fragment arrays in struct mp_sw. */
#define MP_SW_MAX_NUM_FRAGS 512
/*
 * NOTE(review): presumably the upper bound for mp_sw::max_fragment; its use
 * is not visible in this header -- confirm.
 */
#define MP_SW_MAX_FRAGMENT  128

/*
 * Fixed-capacity record holding two parallel uint16_t arrays plus two
 * counters.
 * NOTE(review): presumably the sliding-window decomposition produced by
 * mp_get_sw() -- confirm against its implementation. The field order is kept
 * as-is because the layout may be shared with other code.
 */
struct mp_sw {
    uint16_t fragment[MP_SW_MAX_NUM_FRAGS]; /* one value per fragment */
    uint16_t length[MP_SW_MAX_NUM_FRAGS];   /* one length per fragment */
    int      num_fragments;                 /* presumably the number of used entries -- confirm */
    int      max_fragment;                  /* presumably the largest fragment value -- confirm */
};
00124
/*
 * Host-side helpers for WORD-array integers. word_len defaults to MAX_S in
 * every helper.
 * NOTE(review): the implementations are not in this header; the one-line
 * descriptions below are inferred from the names and signatures -- confirm.
 */

/* Presumably prints the integer `a`, labelled with `name`. */
void mp_print(const char *name,
              const WORD *a,
              int word_len = MAX_S);

/* Presumably converts an OpenSSL BIGNUM into the WORD array `a`. */
void mp_bn2mp(WORD *a,
              const BIGNUM *bn,
              int word_len = MAX_S);

/* Presumably converts the WORD array `a` into an OpenSSL BIGNUM. */
void mp_mp2bn(BIGNUM *bn,
              const WORD *a,
              int word_len = MAX_S);

/* Presumably copies word_len words from `org` to `dst`. */
void mp_copy(WORD *dst,
             const WORD *org,
             int word_len = MAX_S);

/* Presumably fills `ret` with the sliding-window form of `a`. */
void mp_get_sw(struct mp_sw *ret,
               const WORD *a,
               int word_len = MAX_S);
00130
/*
 * Declaration only; the implementation lives elsewhere.
 * NOTE(review): from the name this presumably starts a batch of `cnt`
 * CRT modular exponentiations with S-word operands on `stream`, and the
 * `_d` suffix presumably marks device pointers -- confirm against the
 * definition. `checkbits` is optional and defaults to a null pointer.
 */
void mp_modexp_crt(WORD *a,
                   int cnt,
                   int S,
                   WORD *ret_d,
                   WORD *ar_d,
                   struct mp_sw *sw_d,
                   WORD *n_d,
                   WORD *np_d,
                   WORD *r_sqr_d,
                   cudaStream_t stream,
                   unsigned int stream_id,
                   uint8_t *checkbits = 0);
00139
/*
 * Declaration only; the implementation lives elsewhere.
 * NOTE(review): from the name this presumably waits for (block == true) or
 * polls for completion of work issued on `stream` and delivers the results
 * into `ret`; the meaning of the int return value is not visible here --
 * confirm against the definition. `checkbits` is optional and defaults to a
 * null pointer.
 */
int mp_modexp_crt_sync(WORD *ret,
                       WORD *ret_d,
                       WORD *n_d,
                       WORD *np_d,
                       WORD *r_sqr_d,
                       WORD *iqmp_d,
                       int cnt,
                       int S,
                       bool block,
                       cudaStream_t stream,
                       uint8_t *checkbits = 0);
00145
/*
 * Declaration only; the implementation lives elsewhere. The parameter list
 * is the same as that of mp_modexp_crt_sync().
 * NOTE(review): from the name this presumably runs the CRT post-processing
 * step as a GPU kernel; the meaning of the int return value is not visible
 * here -- confirm against the definition. `checkbits` is optional and
 * defaults to a null pointer.
 */
int mp_modexp_crt_post_kernel(WORD *ret,
                              WORD *ret_d,
                              WORD *n_d,
                              WORD *np_d,
                              WORD *r_sqr_d,
                              WORD *iqmp_d,
                              int cnt,
                              int S,
                              bool block,
                              cudaStream_t stream,
                              uint8_t *checkbits = 0);
00150
00151
00152
/*
 * Self-test entry points; both take no arguments and return nothing.
 * NOTE(review): presumably one exercises the CPU routines and the other the
 * GPU routines declared in this header -- confirm against the definitions.
 */
void mp_test_cpu(void);
void mp_test_gpu(void);
00155
00156
/*
 * CPU multi-precision primitives on WORD arrays (declarations only).
 * NOTE(review): operand lengths and the meaning of the int results of the
 * add/sub routines (presumably carry / borrow out) are not visible in this
 * header -- confirm against the definitions.
 */

/* Presumably ret = a * b. */
void mp_mul_cpu(WORD *ret,
                const WORD *a,
                const WORD *b);

/* Presumably ret = x + y. */
int mp_add_cpu(WORD *ret,
               const WORD *x,
               const WORD *y);

/* Presumably ret = x + 1. */
int mp_add1_cpu(WORD *ret,
                const WORD *x);

/* Presumably ret = x - y. */
int mp_sub_cpu(WORD *ret,
               const WORD *x,
               const WORD *y);

/* Presumably the Montgomery product of a and b for modulus n, with np the
 * precomputed Montgomery constant. */
void mp_montmul_cpu(WORD *ret,
                    const WORD *a,
                    const WORD *b,
                    const WORD *n,
                    const WORD *np);

/* Presumably a modular exponentiation with exponent e and modulus n, where
 * ar is the base already in Montgomery form. */
void mp_modexp_cpu(WORD *ret,
                   const WORD *ar,
                   const WORD *e,
                   const WORD *n,
                   const WORD *np);
00165
/*
 * GPU counterparts of the CPU primitives (declarations only). Parameter
 * lists match the *_cpu versions; unlike mp_add_cpu / mp_add1_cpu /
 * mp_sub_cpu, the add/sub routines here return void.
 * NOTE(review): whether the pointers are host or device addresses is not
 * visible in this header -- confirm against the definitions.
 */

/* Presumably ret = x * y. */
void mp_mul_gpu(WORD *ret,
                const WORD *x,
                const WORD *y);

/* Presumably ret = x + y. */
void mp_add_gpu(WORD *ret,
                const WORD *x,
                const WORD *y);

/* Presumably ret = x + 1. */
void mp_add1_gpu(WORD *ret,
                 const WORD *x);

/* Presumably ret = x - y. */
void mp_sub_gpu(WORD *ret,
                const WORD *x,
                const WORD *y);

/* Presumably the Montgomery product of a and b for modulus n, with np the
 * precomputed Montgomery constant. */
void mp_montmul_gpu(WORD *ret,
                    const WORD *a,
                    const WORD *b,
                    const WORD *n,
                    const WORD *np);

/* Presumably a modular exponentiation with exponent e and modulus n, where
 * ar is the base already in Montgomery form. */
void mp_modexp_gpu(WORD *ret,
                   const WORD *ar,
                   const WORD *e,
                   const WORD *n,
                   const WORD *np);
00174
00175 #endif