aes_context Class Reference

#include <aes_context.hh>

Collaboration diagram for aes_context:
Collaboration graph
[legend]

Public Member Functions

 aes_context (device_context *dev_ctx)
void ecb_128_encrypt (const void *memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char *out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id)
void ecb_128_encrypt_nocopy (const void *memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char *out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id)
void cbc_encrypt (const void *memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long ivs_pos, const unsigned long pkt_offset_pos, const unsigned long tot_in_len, unsigned char *out, const unsigned long num_flows, const unsigned long tot_out_len, const unsigned int stream_id, const unsigned int bits=128)
void cbc_decrypt (const void *memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long ivs_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char *out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id, const unsigned int bits=128)
bool sync (const unsigned int stream_id, const bool block=true, const bool copy_result=true)

Detailed Description

class aes_context

Interface for AES cipher in GPU. It supports 128-bit CBC-mode encryption/decryptiion, ecb 128 bit encryption.


Constructor & Destructor Documentation

aes_context::aes_context ( device_context dev_ctx  ) 

Constructior.

Parameters:
dev_ctx Device context pointer. Device context must be initialized before calling this function.

00011 {
00012         dev_ctx_ = dev_ctx;
00013         for (unsigned i = 0; i <= MAX_STREAM; i++) {
00014                 streams[i].out = 0;
00015                 streams[i].out_d = 0;
00016                 streams[i].out_len = 0;
00017         }
00018 }


Member Function Documentation

void aes_context::cbc_decrypt ( const void *  memory_start,
const unsigned long  in_pos,
const unsigned long  keys_pos,
const unsigned long  ivs_pos,
const unsigned long  pkt_index_pos,
const unsigned long  data_size,
unsigned char *  out,
const unsigned long  block_count,
const unsigned long  num_flows,
unsigned int  stream_id,
const unsigned int  bits = 128 
)

It executes AES-CBC encryption in GPU.

It only supports 128-bit key length at the moment.

If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:
memory_start Starting point of input data.
in_pos Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
keys_pos Offset of region that stores keys.
ivs_pos Offset of region that store IVs.
pkt_index_pos Offset of region that stores per plain text index for every block. It is used to locate key an IV.
data_size Total amount of input data.
out Buffer to store output.
block_count Total number of blocks in all plain texts.
num_flows Total number of plain text.
stream_id Stream index.
bits key length for AES cipher

00270 {
00271         assert(bits==128);
00272         assert(dev_ctx_->get_state(stream_id) == READY);
00273         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00274 
00275         unsigned long total_len = block_count * AES_BLOCK_SIZE;
00276         uint8_t *in_d;
00277         uint8_t *keys_d;
00278         uint8_t *ivs_d;
00279         uint16_t *pkt_index_d;
00280         void *memory_d;
00281         unsigned int threads_per_blk = 512;
00282         int num_blks = (block_count + threads_per_blk - 1) / threads_per_blk;
00283 
00284         //transfor encrypt key to decrypt key
00285         //basically it generates round key and take the last
00286         //CUDA code will generate the round key in reverse order
00287         for (unsigned i = 0; i < num_flows; i++) {
00288                 uint8_t *key = (uint8_t *)memory_start + keys_pos + i * bits / 8;
00289                 AES_decrypt_key_prepare(key, key, bits);
00290         }
00291 
00292         dev_ctx_->clear_checkbits(stream_id, num_blks);
00293 
00294         cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
00295         memory_d = pool->alloc(data_size);
00296         cudaMemcpyAsync(memory_d,
00297                         memory_start,
00298                         data_size,
00299                         cudaMemcpyHostToDevice,
00300                         dev_ctx_->get_stream(stream_id));
00301 
00302         streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len);
00303 
00304         in_d        = (uint8_t *) memory_d + in_pos;
00305         keys_d      = (uint8_t *) memory_d + keys_pos;
00306         ivs_d       = (uint8_t *) memory_d + ivs_pos;
00307         pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
00308 
00309         assert(bits == 128);
00310         if (dev_ctx_->use_stream() && stream_id > 0) {
00311                 AES_cbc_128_decrypt_gpu(in_d,
00312                                         streams[stream_id].out_d,
00313                                         keys_d,
00314                                         ivs_d,
00315                                         pkt_index_d,
00316                                         block_count,
00317                                         dev_ctx_->get_dev_checkbits(stream_id),
00318                                         threads_per_blk,
00319                                         dev_ctx_->get_stream(stream_id));
00320         } else if (stream_id == 0) {
00321                 AES_cbc_128_decrypt_gpu(in_d,
00322                                         streams[stream_id].out_d,
00323                                         keys_d,
00324                                         ivs_d,
00325                                         pkt_index_d,
00326                                         block_count,
00327                                         dev_ctx_->get_dev_checkbits(stream_id),
00328                                         threads_per_blk);
00329         } else {
00330                 assert(0);
00331         }
00332 
00333         assert(cudaGetLastError() == cudaSuccess);
00334 
00335         streams[stream_id].out     = out;
00336         streams[stream_id].out_len = total_len;
00337 
00338         if (!dev_ctx_->use_stream()) {
00339                 sync(stream_id, true);
00340         }
00341 }

Here is the call graph for this function:

void aes_context::cbc_encrypt ( const void *  memory_start,
const unsigned long  in_pos,
const unsigned long  keys_pos,
const unsigned long  ivs_pos,
const unsigned long  pkt_offset_pos,
const unsigned long  tot_in_len,
unsigned char *  out,
const unsigned long  num_flows,
const unsigned long  tot_out_len,
const unsigned int  stream_id,
const unsigned int  bits = 128 
)

It executes AES-CBC encryption in GPU.

It only supports 128-bit key length at the moment.

If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:
memory_start Starting point of input data.
in_pos Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
keys_pos Offset of region that stores keys.
ivs_pos Offset of region that store IVs.
pkt_offset_pos Offset of region that stores position of the beginning of each plain text.
tot_in_len Total amount of input data plus meta data.
out Buffer to store output.
block_count Total number of blocks in all plain texts.
num_flows Total number of plain text.
tot_out_len Total amount of output length.
stream_id Stream index.
bits key length for AES cipher

00034 {
00035         assert(bits == 128);
00036         assert(dev_ctx_->get_state(stream_id) == READY);
00037         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00038 
00039         /* Allocate memory on device */
00040         uint8_t *in_d;
00041         uint8_t *keys_d;
00042         uint8_t *ivs_d;
00043         uint32_t *pkt_offset_d;
00044         void *memory_d;
00045 
00046         //Calculate # of cuda blks required
00047         unsigned int  threads_per_blk = THREADS_PER_BLK;
00048         int           num_blks        = (num_flows + threads_per_blk - 1) / threads_per_blk;
00049         cuda_mem_pool *pool           = dev_ctx_->get_cuda_mem_pool(stream_id);
00050 
00051         memory_d = pool->alloc(tot_in_len);
00052         cudaMemcpyAsync(memory_d, memory_start, tot_in_len,
00053                         cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
00054 
00055         dev_ctx_->clear_checkbits(stream_id, num_blks);
00056 
00057         streams[stream_id].out_d = (uint8_t*)pool->alloc(tot_out_len);
00058 
00059         in_d         = (uint8_t *) memory_d + in_pos;
00060         keys_d       = (uint8_t *) memory_d + keys_pos;
00061         ivs_d        = (uint8_t *) memory_d + ivs_pos;
00062         pkt_offset_d = (uint32_t *) ((uint8_t *)memory_d + pkt_offset_pos);
00063 
00064         /* Call cbc kernel function to do encryption */
00065         if (dev_ctx_->use_stream()) {
00066                 AES_cbc_128_encrypt_gpu(in_d,
00067                                         streams[stream_id].out_d,
00068                                         pkt_offset_d,
00069                                         keys_d,
00070                                         ivs_d,
00071                                         num_flows,
00072                                         dev_ctx_->get_dev_checkbits(stream_id),
00073                                         threads_per_blk,
00074                                         dev_ctx_->get_stream(stream_id));
00075         } else {
00076                 AES_cbc_128_encrypt_gpu(in_d,
00077                                         streams[stream_id].out_d,
00078                                         pkt_offset_d,
00079                                         keys_d,
00080                                         ivs_d,
00081                                         num_flows,
00082                                         dev_ctx_->get_dev_checkbits(stream_id),
00083                                         threads_per_blk);
00084 
00085         }
00086 
00087         assert(cudaGetLastError() == cudaSuccess);
00088 
00089         streams[stream_id].out     = out;
00090         streams[stream_id].out_len = tot_out_len;
00091 
00092         /* Copy data back from device to host */
00093         if (!dev_ctx_->use_stream()) {
00094                 sync(stream_id);
00095         }
00096 }

Here is the call graph for this function:

void aes_context::ecb_128_encrypt ( const void *  memory_start,
const unsigned long  in_pos,
const unsigned long  keys_pos,
const unsigned long  pkt_index_pos,
const unsigned long  data_size,
unsigned char *  out,
const unsigned long  block_count,
const unsigned long  num_flows,
unsigned int  stream_id 
)

It executes AES-ECB-128 encryption in GPU. If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:
memory_start Starting point of input data.
in_pos Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
keys_pos Offset of region that stores keys.
pkt_index_pos Offset of region that stores per plain text index for every block.
data_size Total amount of input data.
out Buffer to store output.
block_count Total number of blocks in all plain texts.
num_flows Total number of plain text.
stream_id Stream index.

00156 {
00157         assert(dev_ctx_->get_state(stream_id) == READY);
00158 
00159         unsigned long total_len = block_count * AES_BLOCK_SIZE;
00160         uint8_t *in_d;
00161         uint8_t *keys_d;
00162         uint16_t *pkt_index_d;
00163         void * memory_d;
00164 
00165         cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
00166         memory_d = pool->alloc(data_size);
00167         cudaMemcpyAsync(memory_d, memory_start, data_size,
00168                         cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
00169 
00170         streams[stream_id].out_d = (uint8_t *) pool->alloc(total_len);
00171 
00172         in_d        = (uint8_t *) memory_d + in_pos;
00173         keys_d      = (uint8_t *) memory_d + keys_pos;
00174         pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
00175 
00176         unsigned int threads_per_blk = THREADS_PER_BLK;
00177 
00178         if (dev_ctx_->use_stream())
00179                 AES_ecb_128_encrypt_gpu(in_d,
00180                                         streams[stream_id].out_d,
00181                                         keys_d,
00182                                         pkt_index_d,
00183                                         block_count,
00184                                         threads_per_blk,
00185                                         dev_ctx_->get_stream(stream_id));
00186         else
00187                 AES_ecb_128_encrypt_gpu(in_d,
00188                                         streams[stream_id].out_d,
00189                                         keys_d,
00190                                         pkt_index_d,
00191                                         block_count,
00192                                         threads_per_blk);
00193 
00194         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00195 
00196         streams[stream_id].out     = out;
00197         streams[stream_id].out_len = total_len;
00198 
00199         if (!dev_ctx_->use_stream()) {
00200                 sync(stream_id);
00201         }
00202 }

Here is the call graph for this function:

void aes_context::ecb_128_encrypt_nocopy ( const void *  memory_start,
const unsigned long  in_pos,
const unsigned long  keys_pos,
const unsigned long  pkt_index_pos,
const unsigned long  data_size,
unsigned char *  out,
const unsigned long  block_count,
const unsigned long  num_flows,
unsigned int  stream_id 
)

It executes AES-ECB-128 encryption in GPU. This function differs from ecb_128_encrypt in that it will not involve any data copy from/to GPU. This was intended to measure purely AES computation performance using GPU.

Parameters:
memory_start Starting point of input data.
in_pos Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
keys_pos Offset of region that stores keys.
pkt_index_pos Offset of region that stores per plain text index for every block.
data_size Total amount of input data.
out Buffer to store output.
block_count Total number of blocks in all plain texts.
num_flows Total number of plain text.
stream_id Stream index.

00213 {
00214         assert(dev_ctx_->get_state(stream_id) == READY);
00215 
00216         unsigned long total_len = block_count * AES_BLOCK_SIZE;
00217         uint8_t *in_d;
00218         uint8_t *keys_d;
00219         uint16_t *pkt_index_d;
00220         void * memory_d;
00221 
00222         cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
00223 
00224         memory_d                 = pool->alloc(data_size);
00225         streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len);
00226         in_d                     = (uint8_t *) memory_d + in_pos;
00227         keys_d                   = (uint8_t *) memory_d + keys_pos;
00228         pkt_index_d              = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
00229 
00230         unsigned int threads_per_blk = THREADS_PER_BLK;
00231 
00232         if (dev_ctx_->use_stream())
00233                 AES_ecb_128_encrypt_gpu(in_d,
00234                                         streams[stream_id].out_d,
00235                                         keys_d,
00236                                         pkt_index_d,
00237                                         block_count,
00238                                         threads_per_blk,
00239                                         dev_ctx_->get_stream(stream_id));
00240         else
00241                 AES_ecb_128_encrypt_gpu(in_d,
00242                                         streams[stream_id].out_d,
00243                                         keys_d, pkt_index_d,
00244                                         block_count,
00245                                         threads_per_blk);
00246 
00247         assert(cudaGetLastError() == cudaSuccess);
00248         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00249 
00250         streams[stream_id].out     = out;
00251         streams[stream_id].out_len = total_len;
00252 
00253         if (!dev_ctx_->use_stream()) {
00254                 sync(stream_id);
00255         }
00256 }

Here is the call graph for this function:

bool aes_context::sync ( const unsigned int  stream_id,
const bool  block = true,
const bool  copy_result = true 
)

Synchronize/query the execution on the stream. This function can be used to check whether the current execution on the stream is finished or also be used to wait until the execution to be finished.

Parameters:
stream_id Stream index.
block Wait for the execution to finish or not. true by default.
copy_result If false, it will not copy result back to CPU.
Returns:
true if the current operation on the stream is finished otherwise false.

00101 {
00102         if (block) {
00103                 dev_ctx_->sync(stream_id, true);
00104                 if (copy_result && dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
00105                         cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
00106                                                       streams[stream_id].out_d,
00107                                                       streams[stream_id].out_len,
00108                                                       cudaMemcpyDeviceToHost,
00109                                                       dev_ctx_->get_stream(stream_id)));
00110                         dev_ctx_->set_state(stream_id, WAIT_COPY);
00111                         dev_ctx_->sync(stream_id, true);
00112                         dev_ctx_->set_state(stream_id, READY);
00113                 } else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
00114                         dev_ctx_->set_state(stream_id, READY);
00115                 }
00116                 return true;
00117         } else {
00118                 if (!dev_ctx_->sync(stream_id, false))
00119                         return false;
00120 
00121                 if (dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
00122                         //if no need for data copy
00123                         if (!copy_result) {
00124                                 dev_ctx_->set_state(stream_id, READY);
00125                                 return true;
00126                         }
00127 
00128                         cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
00129                                                       streams[stream_id].out_d,
00130                                                       streams[stream_id].out_len,
00131                                                       cudaMemcpyDeviceToHost,
00132                                                       dev_ctx_->get_stream(stream_id)));
00133                         dev_ctx_->set_state(stream_id, WAIT_COPY);
00134 
00135                 } else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
00136                         dev_ctx_->set_state(stream_id, READY);
00137                         return true;
00138                 } else if (dev_ctx_->get_state(stream_id) == READY) {
00139                         return true;
00140                 } else {
00141                         assert(0);
00142                 }
00143         }
00144         return false;
00145 }

Here is the call graph for this function:

Here is the caller graph for this function:

 All Data Structures Functions
Generated on Tue Oct 18 10:20:20 2011 for libgpucrypto by  doxygen 1.6.3