diff --git a/README.md b/README.md index a1d2c95..c5d1610 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,14 @@ DataSmoke Datatype detection in order to choose appropriate compression algorithm. -Since already compressed, text and multimedia files are better compressed with specific algorithms, we need a fast and reliable way to detect those data. I call it data smoking. +Since incompressible, text and multimedia files are better compressed with specific algorithms, we need a fast and reliable way to detect those data. I call it data smoking. This project will provide various experimental algorithms that can recognize some of special datatypes (not necessary all), as well as samples of data that are especially hard to smoke correctly. -The full list of smells: +The full list of smells (speeds measured on a single core of an i7-4770): - ByteSmoker: computes entropy of individual bytes (2 GB/s). -- WordSmoker: computes entropy of 16-bit words (1 GB/s). +- WordSmoker: computes entropy of 16-bit words (0.7-1.5 GB/s). - DWordSmoker: computes entropy of 32-bit dwords (3 GB/s). +- Order1Smoker: computes order-1 entropy of 8-bit bytes (0.7-1.5 GB/s). 
diff --git a/smoke.cpp b/smoke.cpp index 2d70663..1c0a958 100644 --- a/smoke.cpp +++ b/smoke.cpp @@ -70,7 +70,6 @@ void ByteSmoker::smoke (void *buf, size_t bufsize, double *entropy) class WordSmoker : public Smoker { uint32_t *count; - size_t bits[256]; public: WordSmoker() {count = new uint32_t[256*256];} virtual const char* name() {return "WordSmoker";}; @@ -84,7 +83,7 @@ void WordSmoker::smoke (void *buf, size_t bufsize, double *entropy) byte *p = (byte*) buf; for (int i=0; i1?"\n":"", argv[file]); - ByteSmoker ByteS; - WordSmoker WordS; - DWordSmoker DWordS; - Smoker *smokers[] = {&ByteS, &WordS, &DWordS}; + ByteSmoker ByteS; + WordSmoker WordS; + DWordSmoker DWordS; + Order1Smoker Order1S; + Smoker *smokers[] = {&ByteS, &WordS, &Order1S, &DWordS}; const int NumSmokers = sizeof(smokers)/sizeof(*smokers); double entropy, min_entropy[NumSmokers], avg_entropy[NumSmokers] = {0}, max_entropy[NumSmokers] = {0}; for (int i=0; i