aes_context Class Reference

#include <aes_context.hh>

Collaboration diagram for aes_context:

Public Member Functions
	aes_context (device_context *dev_ctx)
void	ecb_128_encrypt (const void memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id)
void	ecb_128_encrypt_nocopy (const void memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id)
void	cbc_encrypt (const void memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long ivs_pos, const unsigned long pkt_offset_pos, const unsigned long tot_in_len, unsigned char out, const unsigned long num_flows, const unsigned long tot_out_len, const unsigned int stream_id, const unsigned int bits=128)
void	cbc_decrypt (const void memory_start, const unsigned long in_pos, const unsigned long keys_pos, const unsigned long ivs_pos, const unsigned long pkt_index_pos, const unsigned long data_size, unsigned char out, const unsigned long block_count, const unsigned long num_flows, unsigned int stream_id, const unsigned int bits=128)
bool	sync (const unsigned int stream_id, const bool block=true, const bool copy_result=true)

Detailed Description

class aes_context

Interface for AES cipher in GPU. It supports 128-bit CBC-mode encryption/decryptiion, ecb 128 bit encryption.

Constructor & Destructor Documentation

aes_context::aes_context ( device_context * dev_ctx )

Constructior.

Parameters:

dev_ctx

Device context pointer. Device context must be initialized before calling this function.

00011 {
00012         dev_ctx_ = dev_ctx;
00013         for (unsigned i = 0; i <= MAX_STREAM; i++) {
00014                 streams[i].out = 0;
00015                 streams[i].out_d = 0;
00016                 streams[i].out_len = 0;
00017         }
00018 }

Member Function Documentation

void aes_context::cbc_decrypt	(	const void *	memory_start,
		const unsigned long	in_pos,
		const unsigned long	keys_pos,
		const unsigned long	ivs_pos,
		const unsigned long	pkt_index_pos,
		const unsigned long	data_size,
		unsigned char *	out,
		const unsigned long	block_count,
		const unsigned long	num_flows,
		unsigned int	stream_id,
		const unsigned int	bits = `128`
	)

It executes AES-CBC encryption in GPU.

It only supports 128-bit key length at the moment.

If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:

	memory_start	Starting point of input data.
	in_pos	Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
	keys_pos	Offset of region that stores keys.
	ivs_pos	Offset of region that store IVs.
	pkt_index_pos	Offset of region that stores per plain text index for every block. It is used to locate key an IV.
	data_size	Total amount of input data.
	out	Buffer to store output.
	block_count	Total number of blocks in all plain texts.
	num_flows	Total number of plain text.
	stream_id	Stream index.
	bits	key length for AES cipher

00271 00272 00273 00274 00275 00276 00277 00278 00279 00280 00281 00282 00283 00284 00285 00286 00287 00288 00289 00290         } 00291 00292 00293 00294 00295 00296 00297 00298 00299 00300 00301 00302 00303 00304 00305 00306 00307 00308 00309 00310 00311 00312 00313 00314 00315 00316 00317 00318 00319 00320 00321 00322 00323 00324 00325 00326 00327 00328 00329 00330 00331         } 00332 00333 00334 00335 00336 00337 00338 00339 00340         } 00341 }

class="fragment">00270 { assert(bits==128); assert(dev_ctx_->get_state(stream_id) == READY); dev_ctx_->set_state(stream_id, WAIT_KERNEL); unsigned long total_len = block_count * AES_BLOCK_SIZE; uint8_t *in_d; uint8_t *keys_d; uint8_t *ivs_d; uint16_t *pkt_index_d; void *memory_d; unsigned int threads_per_blk = 512; int num_blks = (block_count + threads_per_blk - 1) / threads_per_blk; //transfor encrypt key to decrypt key //basically it generates round key and take the last //CUDA code will generate the round key in reverse order for (unsigned i = 0; i < num_flows; i++) { uint8_t *key = (uint8_t *)memory_start + keys_pos + i * bits / 8; AES_decrypt_key_prepare(key, key, bits); dev_ctx_->clear_checkbits(stream_id, num_blks); cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id); memory_d = pool->alloc(data_size); cudaMemcpyAsync(memory_d, memory_start, data_size, cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id)); streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len); in_d = (uint8_t *) memory_d + in_pos; keys_d = (uint8_t *) memory_d + keys_pos; ivs_d = (uint8_t *) memory_d + ivs_pos; pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos); assert(bits == 128); if (dev_ctx_->use_stream() && stream_id > 0) { AES_cbc_128_decrypt_gpu(in_d, streams[stream_id].out_d, keys_d, ivs_d, pkt_index_d, block_count, dev_ctx_->get_dev_checkbits(stream_id), threads_per_blk, dev_ctx_->get_stream(stream_id)); } else if (stream_id == 0) { AES_cbc_128_decrypt_gpu(in_d, streams[stream_id].out_d, keys_d, ivs_d, pkt_index_d, block_count, dev_ctx_->get_dev_checkbits(stream_id), threads_per_blk); } else { assert(0); assert(cudaGetLastError() == cudaSuccess); streams[stream_id].out = out; streams[stream_id].out_len = total_len; if (!dev_ctx_->use_stream()) { sync(stream_id, true);

Here is the call graph for this function:

void aes_context::cbc_encrypt	(	const void *	memory_start,
		const unsigned long	in_pos,
		const unsigned long	keys_pos,
		const unsigned long	ivs_pos,
		const unsigned long	pkt_offset_pos,
		const unsigned long	tot_in_len,
		unsigned char *	out,
		const unsigned long	num_flows,
		const unsigned long	tot_out_len,
		const unsigned int	stream_id,
		const unsigned int	bits = `128`
	)

It executes AES-CBC encryption in GPU.

It only supports 128-bit key length at the moment.

If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:

	memory_start	Starting point of input data.
	in_pos	Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
	keys_pos	Offset of region that stores keys.
	ivs_pos	Offset of region that store IVs.
	pkt_offset_pos	Offset of region that stores position of the beginning of each plain text.
	tot_in_len	Total amount of input data plus meta data.
	out	Buffer to store output.
	block_count	Total number of blocks in all plain texts.
	num_flows	Total number of plain text.
	tot_out_len	Total amount of output length.
	stream_id	Stream index.
	bits	key length for AES cipher

00034 {
00035         assert(bits == 128);
00036         assert(dev_ctx_->get_state(stream_id) == READY);
00037         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00038 
00039         /* Allocate memory on device */
00040         uint8_t *in_d;
00041         uint8_t *keys_d;
00042         uint8_t *ivs_d;
00043         uint32_t *pkt_offset_d;
00044         void *memory_d;
00045 
00046         //Calculate # of cuda blks required
00047         unsigned int  threads_per_blk = THREADS_PER_BLK;
00048         int           num_blks        = (num_flows + threads_per_blk - 1) / threads_per_blk;
00049         cuda_mem_pool *pool           = dev_ctx_->get_cuda_mem_pool(stream_id);
00050 
00051         memory_d = pool->alloc(tot_in_len);
00052         cudaMemcpyAsync(memory_d, memory_start, tot_in_len,
00053                         cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
00054 
00055         dev_ctx_->clear_checkbits(stream_id, num_blks);
00056 
00057         streams[stream_id].out_d = (uint8_t*)pool->alloc(tot_out_len);
00058 
00059         in_d         = (uint8_t *) memory_d + in_pos;
00060         keys_d       = (uint8_t *) memory_d + keys_pos;
00061         ivs_d        = (uint8_t *) memory_d + ivs_pos;
00062         pkt_offset_d = (uint32_t *) ((uint8_t *)memory_d + pkt_offset_pos);
00063 
00064         /* Call cbc kernel function to do encryption */
00065         if (dev_ctx_->use_stream()) {
00066                 AES_cbc_128_encrypt_gpu(in_d,
00067                                         streams[stream_id].out_d,
00068                                         pkt_offset_d,
00069                                         keys_d,
00070                                         ivs_d,
00071                                         num_flows,
00072                                         dev_ctx_->get_dev_checkbits(stream_id),
00073                                         threads_per_blk,
00074                                         dev_ctx_->get_stream(stream_id));
00075         } else {
00076                 AES_cbc_128_encrypt_gpu(in_d,
00077                                         streams[stream_id].out_d,
00078                                         pkt_offset_d,
00079                                         keys_d,
00080                                         ivs_d,
00081                                         num_flows,
00082                                         dev_ctx_->get_dev_checkbits(stream_id),
00083                                         threads_per_blk);
00084 
00085         }
00086 
00087         assert(cudaGetLastError() == cudaSuccess);
00088 
00089         streams[stream_id].out     = out;
00090         streams[stream_id].out_len = tot_out_len;
00091 
00092         /* Copy data back from device to host */
00093         if (!dev_ctx_->use_stream()) {
00094                 sync(stream_id);
00095         }
00096 }

Here is the call graph for this function:

void aes_context::ecb_128_encrypt	(	const void *	memory_start,
		const unsigned long	in_pos,
		const unsigned long	keys_pos,
		const unsigned long	pkt_index_pos,
		const unsigned long	data_size,
		unsigned char *	out,
		const unsigned long	block_count,
		const unsigned long	num_flows,
		unsigned int	stream_id
	)

It executes AES-ECB-128 encryption in GPU. If stream is enabled it will run in non-blocking mode, if not, it will run in blocking mode and the result will be written back to out at the end of function call. This function takes one or more plaintext and returns ciphertext for all of them.

Parameters:

	memory_start	Starting point of input data.
	in_pos	Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
	keys_pos	Offset of region that stores keys.
	pkt_index_pos	Offset of region that stores per plain text index for every block.
	data_size	Total amount of input data.
	out	Buffer to store output.
	block_count	Total number of blocks in all plain texts.
	num_flows	Total number of plain text.
	stream_id	Stream index.

00156 {
00157         assert(dev_ctx_->get_state(stream_id) == READY);
00158 
00159         unsigned long total_len = block_count * AES_BLOCK_SIZE;
00160         uint8_t *in_d;
00161         uint8_t *keys_d;
00162         uint16_t *pkt_index_d;
00163         void * memory_d;
00164 
00165         cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
00166         memory_d = pool->alloc(data_size);
00167         cudaMemcpyAsync(memory_d, memory_start, data_size,
00168                         cudaMemcpyHostToDevice, dev_ctx_->get_stream(stream_id));
00169 
00170         streams[stream_id].out_d = (uint8_t *) pool->alloc(total_len);
00171 
00172         in_d        = (uint8_t *) memory_d + in_pos;
00173         keys_d      = (uint8_t *) memory_d + keys_pos;
00174         pkt_index_d = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
00175 
00176         unsigned int threads_per_blk = THREADS_PER_BLK;
00177 
00178         if (dev_ctx_->use_stream())
00179                 AES_ecb_128_encrypt_gpu(in_d,
00180                                         streams[stream_id].out_d,
00181                                         keys_d,
00182                                         pkt_index_d,
00183                                         block_count,
00184                                         threads_per_blk,
00185                                         dev_ctx_->get_stream(stream_id));
00186         else
00187                 AES_ecb_128_encrypt_gpu(in_d,
00188                                         streams[stream_id].out_d,
00189                                         keys_d,
00190                                         pkt_index_d,
00191                                         block_count,
00192                                         threads_per_blk);
00193 
00194         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00195 
00196         streams[stream_id].out     = out;
00197         streams[stream_id].out_len = total_len;
00198 
00199         if (!dev_ctx_->use_stream()) {
00200                 sync(stream_id);
00201         }
00202 }

Here is the call graph for this function:

void aes_context::ecb_128_encrypt_nocopy	(	const void *	memory_start,
		const unsigned long	in_pos,
		const unsigned long	keys_pos,
		const unsigned long	pkt_index_pos,
		const unsigned long	data_size,
		unsigned char *	out,
		const unsigned long	block_count,
		const unsigned long	num_flows,
		unsigned int	stream_id
	)

It executes AES-ECB-128 encryption in GPU. This function differs from ecb_128_encrypt in that it will not involve any data copy from/to GPU. This was intended to measure purely AES computation performance using GPU.

Parameters:

	memory_start	Starting point of input data.
	in_pos	Offset of plain texts. All plain texts are gathered in to one big buffer with 16-byte align which is AES block size.
	keys_pos	Offset of region that stores keys.
	pkt_index_pos	Offset of region that stores per plain text index for every block.
	data_size	Total amount of input data.
	out	Buffer to store output.
	block_count	Total number of blocks in all plain texts.
	num_flows	Total number of plain text.
	stream_id	Stream index.

00213 {
00214         assert(dev_ctx_->get_state(stream_id) == READY);
00215 
00216         unsigned long total_len = block_count * AES_BLOCK_SIZE;
00217         uint8_t *in_d;
00218         uint8_t *keys_d;
00219         uint16_t *pkt_index_d;
00220         void * memory_d;
00221 
00222         cuda_mem_pool *pool = dev_ctx_->get_cuda_mem_pool(stream_id);
00223 
00224         memory_d                 = pool->alloc(data_size);
00225         streams[stream_id].out_d = (uint8_t*)pool->alloc(total_len);
00226         in_d                     = (uint8_t *) memory_d + in_pos;
00227         keys_d                   = (uint8_t *) memory_d + keys_pos;
00228         pkt_index_d              = (uint16_t *) ((uint8_t *) memory_d + pkt_index_pos);
00229 
00230         unsigned int threads_per_blk = THREADS_PER_BLK;
00231 
00232         if (dev_ctx_->use_stream())
00233                 AES_ecb_128_encrypt_gpu(in_d,
00234                                         streams[stream_id].out_d,
00235                                         keys_d,
00236                                         pkt_index_d,
00237                                         block_count,
00238                                         threads_per_blk,
00239                                         dev_ctx_->get_stream(stream_id));
00240         else
00241                 AES_ecb_128_encrypt_gpu(in_d,
00242                                         streams[stream_id].out_d,
00243                                         keys_d, pkt_index_d,
00244                                         block_count,
00245                                         threads_per_blk);
00246 
00247         assert(cudaGetLastError() == cudaSuccess);
00248         dev_ctx_->set_state(stream_id, WAIT_KERNEL);
00249 
00250         streams[stream_id].out     = out;
00251         streams[stream_id].out_len = total_len;
00252 
00253         if (!dev_ctx_->use_stream()) {
00254                 sync(stream_id);
00255         }
00256 }

Here is the call graph for this function:

bool aes_context::sync	(	const unsigned int	stream_id,
		const bool	block = `true`,
		const bool	copy_result = `true`
	)

Synchronize/query the execution on the stream. This function can be used to check whether the current execution on the stream is finished or also be used to wait until the execution to be finished.

Parameters:

	stream_id	Stream index.
	block	Wait for the execution to finish or not. true by default.
	copy_result	If false, it will not copy result back to CPU.

Returns:: true if the current operation on the stream is finished otherwise false.

00101 {
00102         if (block) {
00103                 dev_ctx_->sync(stream_id, true);
00104                 if (copy_result && dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
00105                         cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
00106                                                       streams[stream_id].out_d,
00107                                                       streams[stream_id].out_len,
00108                                                       cudaMemcpyDeviceToHost,
00109                                                       dev_ctx_->get_stream(stream_id)));
00110                         dev_ctx_->set_state(stream_id, WAIT_COPY);
00111                         dev_ctx_->sync(stream_id, true);
00112                         dev_ctx_->set_state(stream_id, READY);
00113                 } else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
00114                         dev_ctx_->set_state(stream_id, READY);
00115                 }
00116                 return true;
00117         } else {
00118                 if (!dev_ctx_->sync(stream_id, false))
00119                         return false;
00120 
00121                 if (dev_ctx_->get_state(stream_id) == WAIT_KERNEL) {
00122                         //if no need for data copy
00123                         if (!copy_result) {
00124                                 dev_ctx_->set_state(stream_id, READY);
00125                                 return true;
00126                         }
00127 
00128                         cutilSafeCall(cudaMemcpyAsync(streams[stream_id].out,
00129                                                       streams[stream_id].out_d,
00130                                                       streams[stream_id].out_len,
00131                                                       cudaMemcpyDeviceToHost,
00132                                                       dev_ctx_->get_stream(stream_id)));
00133                         dev_ctx_->set_state(stream_id, WAIT_COPY);
00134 
00135                 } else if (dev_ctx_->get_state(stream_id) == WAIT_COPY) {
00136                         dev_ctx_->set_state(stream_id, READY);
00137                         return true;
00138                 } else if (dev_ctx_->get_state(stream_id) == READY) {
00139                         return true;
00140                 } else {
00141                         assert(0);
00142                 }
00143         }
00144         return false;
00145 }

Here is the call graph for this function:

Here is the caller graph for this function:

aes_context Class Reference

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation