浅谈Zstandard压缩算法

admin 2024年9月17日16:09:58评论7 views字数 15916阅读53分3秒阅读模式

目录

  • LZ77字典匹配
  • Huffman 编码
  • Zstandard
  • reference
  • 后记

LZ77字典匹配

算法步骤

  1. 设置编码位置为输入流的开始
  2. 在滑窗的待编码区查找搜索区中的最大匹配字符串
  3. 如果找到字符串,输出(偏移值, 匹配长度), 窗口向前滑动“匹配长度”
  4. 如果没有找到,输出(0, 0, 待编码区的第一个字符),窗口向前滑动一个单位
  5. 如果待编码区不为空,回到步骤2

代码实现

#include <iostream>
#include <string>
#include <vector>
#include <utility>

class LZ77 {
public:
LZ77(const std::string& inputStr)
: inputStr(inputStr), searchSize(5), aheadSize(3), windSpiltIndex(0), move(0), notFind(-1) {}

void encoding() {
int step = 0;
std::cout << "Step   Position   Match   Output" << std::endl;
while (!isLookHeadEmpty()) {
winMove();
std::pair<intint> matchResult = findMaxMatch();
int offset = matchResult.first;
int matchLen = matchResult.second;
setMoveSteps(matchLen);

if (matchLen == 0) {
char nextChar = inputStr[windSpiltIndex];
std::cout << step << "      " << windSpiltIndex << "           -     "
<< "(0,0," << nextChar <<")"<< std::endl;
}
else {
std::string matchStr = inputStr.substr(windSpiltIndex - offset, matchLen);
std::cout << step << "      " << windSpiltIndex << "           " << matchStr << "     "
<< "(" << offset << ", " << matchLen << ")" << std::endl;
}
step++;
}
}

private:
std::string inputStr;
int searchSize;
int aheadSize;
int windSpiltIndex;
int move;
const int notFind;

int getWinEndIndex() const {
return windSpiltIndex + aheadSize;
}

int getWinStartIndex() const {
return windSpiltIndex - searchSize;
}

bool isLookHeadEmpty() const {
return windSpiltIndex + move > inputStr.size() - 1;
}

void winMove() {
windSpiltIndex += move;
}

std::pair<intintfindMaxMatch() {
int matchLen = 0;
int offset = 0;
int minEdge = calculateMinEdge();
for (int i = windSpiltIndex + 1; i <=minEdge; i++) {
int offsetTemp = searchBufferOffset(i);
if (offsetTemp == notFind) {
return std::make_pair(offset, matchLen);
}
offset = offsetTemp;
matchLen++;
}
return std::make_pair(offset, matchLen);
}

int searchBufferOffset(int i) const {
int searchStart = getWinStartIndex();
int searchEnd = windSpiltIndex;
if (searchEnd < 1) {
return notFind;
}
if (searchStart < 0) {
searchStart = 0;
if (searchEnd == 0) {
searchEnd = 1;
}
}
std::string searchStr = inputStr.substr(searchStart, searchEnd - searchStart);
std::string searchFor = inputStr.substr(windSpiltIndex, i - windSpiltIndex);
size_t findIndex = searchStr.find(searchFor);
if (findIndex == std::string::npos) {
return notFind;
}
return searchStr.size() - findIndex;
}

void setMoveSteps(int matchLen) {
//move = (matchLen == 0) ? 1 : matchLen;
move = std::max(1, matchLen);

}

int calculateMinEdge() const {
return std::min(int (inputStr.size()), getWinEndIndex());
}
};

int main() {
LZ77 lz77("AABCBBABCAB");
lz77.encoding();
return 0;
}

浅谈Zstandard压缩算法

Huffman 编码

#include <iostream>
#include <vector>
#include <queue>
#include <unordered_map>
#include <fstream>
#include <bitset>

// Node structure for the Huffman Tree
struct Node {
char ch;
int freq;
Node* left;
Node* right;

Node(char c, int f) : ch(c), freq(f), left(nullptr), right(nullptr) {}
};

// Compare structure for the priority queue
struct Compare {
bool operator()(Node* left, Node* right) {
return left->freq > right->freq;//定义最小堆
}
};

// Build the Huffman Tree
Node* buildHuffmanTree(const std::unordered_map<charint>& freqMap) {
std::priority_queue<Node*, std::vector<Node*>, Compare> pq;

// Create a leaf node for each character and add it to the priority queue
for (auto pair : freqMap) {
pq.push(new Node(pair.first, pair.second));
}

// Iterate until the queue contains only one node (the root)
while (pq.size() > 1) {
Node* left = pq.top(); pq.pop();
Node* right = pq.top(); pq.pop();

// Create a new internal node with frequency equal to the sum of the two nodes
Node* internalNode = new Node('�', left->freq + right->freq);
internalNode->left = left;
internalNode->right = right;

pq.push(internalNode);
}

// The remaining node is the root of the Huffman Tree
return pq.top();
}

// Traverse the Huffman Tree and store codes in a map
void generateCodes(Node* root, std::unordered_map<charstd::string>& huffmanCode, std::string code) {
if (!root) return;

// If this is a leaf node, store the character and its code
if (root->ch != '�') {
huffmanCode[root->ch] = code;
}

// Traverse the left and right children
generateCodes(root->left, huffmanCode, code + "0");
generateCodes(root->right, huffmanCode, code + "1");
}

// Encode the input string using the Huffman codes
std::string encode(const std::string& text, const std::unordered_map<charstd::string>& huffmanCode) {
std::string encodedString;
for (char ch : text) {
encodedString += huffmanCode.at(ch);
}
return encodedString;
}

// Decode the encoded string using the Huffman Tree
std::string decode(const std::string& encodedString, Node* root) {
std::string decodedString;
Node* currentNode = root;

for (char bit : encodedString) {
if (bit == '0') {
currentNode = currentNode->left;
}
else {
currentNode = currentNode->right;
}

// If we reach a leaf node, append the character to the decoded string
if (currentNode->left == nullptr && currentNode->right == nullptr) {
decodedString += currentNode->ch;
currentNode = root;
}
}

return decodedString;
}
void freeTree(Node* root) {
if (root == nullptr) {
return;
}
freeTree(root->left);
freeTree(root->right);
delete root;
}
void writeBitStreamToFile(const std::string& bitStream, const std::string& filename) {
std::ofstream outFile(filename, std::ios::binary);
char buffer = 0;
int bitCount = 0;

for (char bit : bitStream) {
buffer <<= 1;
if (bit == '1') {
buffer |= 1;
}
bitCount++;

// 当收集到 8 位时,将字节写入文件
if (bitCount == 8) {
outFile.put(buffer);
buffer = 0;
bitCount = 0;
}
}

// 如果最后不满8位,补齐0再写入
if (bitCount > 0) {
buffer <<= (8 - bitCount);  // 将剩余位数补齐到8位
outFile.put(buffer);
}

outFile.close();
}
std::string decodeFromFile(const std::string& filename,std::unordered_map<std::stringchar> reverseHuffmanCode)
{
std::ifstream inFile(filename, std::ios::binary);
if (!inFile.is_open()) {
std::cerr << "Error: Cannot open file!" << std::endl;
return "";
}

std::string bitStream = "";
char buffer;
while (inFile.get(buffer)){
std::bitset bites= std::bitset<8>(buffer);
bitStream += bites.to_string();
// 读取每个字节,并将其转换为位串
}

inFile.close();

std::string decodedText = "";
std::string currentBits = "";
for (char bit : bitStream) {
currentBits += bit;

// 一旦找到编码匹配字符,解码该字符
if (reverseHuffmanCode.find(currentBits) != reverseHuffmanCode.end()) {
decodedText += reverseHuffmanCode[currentBits];
currentBits = "";  // 重置
}
}

return decodedText;
}
int main() {
std::string text = "Huffman coding is a data compression algorithm";

// Step 1: Frequency Count
std::unordered_map<charint> freqMap;
for (char ch : text) {
freqMap[ch]++;
}

// Step 2: Build the Huffman Tree
Node* root = buildHuffmanTree(freqMap);

// Step 3: Generate Huffman Codes
std::unordered_map<charstd::string> huffmanCode;
generateCodes(root, huffmanCode, "");

// Display the Huffman codes
std::cout << "Huffman Codes:n";
for (auto pair : huffmanCode) {
std::cout << pair.first << ": " << pair.second << 'n';
}

// Step 4: Encode the input string
std::string encodedString = encode(text, huffmanCode);
std::cout << "nEncoded String: " << encodedString << 'n';
writeBitStreamToFile(encodedString, "out.bin");

// Step 5 (Optional): Decode the encoded string
std::unordered_map<std::stringchar> rehaffman;
for (auto it : huffmanCode) {
rehaffman.insert(std::make_pair(it.second, it.first));
}
std::string newstring = decodeFromFile("out.bin", rehaffman);

//std::string decodedString = decode(encodedString, root);
std::cout << "nDecoded String: " << newstring << "n";

return 0;
}

Huffman Codes: a: 011 i: 000 g: 0010 m: 1000 e: 00110 l: 00111 s: 0101 c: 0100 n: 1001 : 101 o: 1100 f: 11010 t: 11011 H: 111000 p: 111001 r: 11101 d: 11110 h: 111110 u: 111111

Encoded String: 1110001111111101011010100001110011010100110011110000100100101010000101101011101111100111101101110101001100100011100111101001100101010100011001001101011001110010110011101000110111111101000

Decoded String: Huffman coding is a data compression algorithmi

E3 FD 6A 1C D4 CF 09 2A 16 BB E7 B7 53 23 9E 99 54 64 D6 72 CE 8D FD 00

map/unordered_map

  1. Underlying Data Structure:
  • map: Implemented as a self-balancing binary search tree (typically a Red-Black tree). This means that the elements are stored in a sorted order based on the keys.
  • unordered_map: Implemented as a hash table. Elements are stored in no particular order, but operations (insertion, lookup) are generally faster.
  1. Time Complexity:
  • map: All operations (insertion, deletion, search) take O(log⁡n) time due to the tree structure.
  • unordered_map: Average time complexity for most operations is O(1) because of the hash table. However, in the worst case (due to hash collisions), it can degrade to O(n).
  1. Order of Elements:
  • map: Elements are stored in sorted order based on their keys, which can be useful when you need sorted data.
  • unordered_map: There is no guaranteed order of the elements.
  1. Memory Usage:
  • map: Uses more memory because of the tree structure overhead (pointers to left and right child nodes).
  • unordered_map: Uses extra memory for maintaining the hash table and handling hash collisions (e.g., chaining or open addressing).
  1. When to Use map:
  • When you need the keys to be sorted (e.g., iterating over keys in order).
  • When the cost of log-time complexity is acceptable, and you need stable performance.
  • When insertion order or lookup order matters.
  1. When to Use unordered_map:
  • When you don't need the keys to be sorted.
  • When you prioritize faster access (constant time average performance).
  • When you are working with large datasets and need efficient lookups.

find/[]/at

  • find:判断find的返回结果,才知道有没有
  • []:不管有没有,都是有。没有就是0,字符串就是空(自动创建)
  • at:如果存在,则返回它的值,如果不存在,则抛出异常

Zstandard

api

/*! ZSTD_compress() :
 *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
 *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
 *  @return : compressed size written into `dst` (<= `dstCapacity),
 *            or an error code if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_compressvoid* dst, size_t dstCapacity,
                            const void* src, size_t srcSize,
                                  int compressionLevel);

/*! ZSTD_decompress() :
*  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
*  `dstCapacity` is an upper bound of originalSize.
*  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
*  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
*            or an errorCode if it fails (which can be tested using ZSTD_isError()). */

ZSTDLIB_API size_t ZSTD_decompressvoid* dst, size_t dstCapacity,
const void* src, size_t compressedSize)
;

ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);

/*!< maximum compressed size in worst case scenario */
ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize);

压缩

#include<iostream>
#include<windows.h>
#include<zstd.h>

int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;
const char* path = "D:\ant me\Ant Messenger\Ant Messenger.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0NULL);
dwFileSize = GetFileSize(hFile, NULL);
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);

// 计算压缩后的最大缓冲区大小
size_t compressed_size_bound = ZSTD_compressBound(dwFileSize);
char* compressed_data = new char[compressed_size_bound];

// 压缩数据
size_t compressed_size = ZSTD_compress(compressed_data, compressed_size_bound, input, dwFileSize, 1);
if (ZSTD_isError(compressed_size)) {
std::cerr << "Compression failed: " << ZSTD_getErrorName(compressed_size) << std::endl;
delete[] compressed_data;
return 1;
}

std::cout << "Original size: " << dwFileSize << std::endl;
std::cout << "Compressed size: " << compressed_size << std::endl;
const char* path2 = "D:\ant me\Ant Messenger\coleak.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0NULL);
WriteFile(hFile2, compressed_data, compressed_size, &FileReadSize, NULL);

delete[] compressed_data;
return 0;
}

ZSTD_compressBound->ZSTD_compress

解压缩

#include <iostream>
#include <windows.h>
#include <zstd.h>

int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;

// 打开压缩文件
const char* path = "D:\ant me\Ant Messenger\coleak.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0NULL);
dwFileSize = GetFileSize(hFile, NULL); // 获取压缩文件的大小
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);

// 先计算解压后所需的最大缓冲区大小
unsigned long long decompressed_size = ZSTD_getFrameContentSize(input, dwFileSize);
if (decompressed_size == ZSTD_CONTENTSIZE_ERROR) {
std::cerr << "Not compressed by zstd!" << std::endl;
return 1;
}
else if (decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) {
std::cerr << "Original size is unknown!" << std::endl;
return 1;
}

// 分配解压后的数据缓冲区
char* decompressed_data = new char[decompressed_size];

// 解压数据
size_t result = ZSTD_decompress(decompressed_data, decompressed_size, input, dwFileSize);
if (ZSTD_isError(result)) {
std::cerr << "Decompression failed: " << ZSTD_getErrorName(result) << std::endl;
delete[] decompressed_data;
return 1;
}

std::cout << "Decompression successful! Original size: " << decompressed_size << std::endl;

const char* path2 = "D:\ant me\Ant Messenger\coleak2.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0NULL);
WriteFile(hFile2, decompressed_data, decompressed_size, &FileReadSize, NULL);
delete[] decompressed_data;
return 0;
}

ZSTD_getFrameContentSize->ZSTD_decompress

压缩等级

它的压缩级别从负5级(最快)到22级(压缩速度最慢,但是压缩比最高)可以调节

上面的介绍来自维基百科,但是实测压缩等级可以设置在区间外,但是相差越远越不明显

-5:压缩速度极快,压缩比低,占用较少的内存
22:非常慢的压缩速度。 最佳压缩比,最大限度减少文件大小,适合数据冷存储。
随着级别的增加,压缩速度减慢,但压缩比增高,内存占用也随着压缩级别的增加而增加。

reference

https://tech.meituan.com/2021/01/07/pack-gzip-zstd-lz4.html
https://learn.microsoft.com/zh-cn/windows/win32/cmpapi/using-the-compression-api
https://zh.wikipedia.org/wiki/Zstandard
https://xwxz.github.io/algorithm-LZ77/

后记

ifstream

Open Modes

std::ios::in: Default mode for ifstream, which opens the file for input.

std::ios::binary: Open in binary mode (useful for non-text files).

std::ios::ate: Move the file pointer to the end of the file after opening.

std::ios::app: Append mode.

std::ios::trunc: If the file exists, its contents are truncated (deleted).

#include <iostream>
#include <fstream>
#include <string>

int main() {
std::ifstream inputFile("a.txt");

if (!inputFile.is_open()) {
std::cerr << "Failed to open file!" << std::endl;
return 1;
}

std::string line;
//while (std::getline(inputFile, line)) {
//    std::cout << line << std::endl;
//}
char ch;
while (inputFile.get(ch))
{
std::cout << ch;
}
if (inputFile.eof()) {
std::cout << "End of file reached." << std::endl;
}

else if (inputFile.fail()) {
std::cerr << "Error while reading file!" << std::endl;
}

inputFile.close();
return 0;
}

ofstream

Open Modes

std::ios::out (default for ofstream): Opens the file for writing.

std::ios::app: Opens the file for appending. Data is written to the end of the file.

std::ios::trunc: If the file exists, it is truncated to zero length (default).

std::ios::binary: Opens the file in binary mode (for binary files).

std::ios::ate: Opens the file and moves the file pointer to the end of the file.

#include <fstream>

int main() {
// Opening a file in append mode
std::ofstream outFile("log.txt"std::ios::app);
outFile << "Appending some text.n";
outFile.flush(); // Data is written to the file without closing it
outFile.put('H');   // Write 'H' to the file
outFile.put('e');   // Write 'e' to the file
outFile.put('l');   // Write 'l' to the file
outFile.put('l');   // Write 'l' to the file
outFile.put('o');   // Write 'o' to the file
outFile.put('!');   // Write '!' to the file
outFile.close();

// Opening a file in binary mode
std::ofstream binFile("data.bin"std::ios::binary);
const char c[10] = { 0x10,0x48,0x5f };
binFile.write(c, 10);  // Writing raw bytes
binFile.close();

return 0;
}

函数表

Function Description
ofstream outFile Constructor to create and open a file for writing.
outFile.is_open() Checks if the file was opened successfully.
outFile << "text" Writes text or data to the file.
outFile.close() Closes the file.
outFile.flush() Flushes the buffer to the file.
outFile.write() Writes binary data to the file.
outFile.fail() Checks if a write operation failed.
outFile.bad() Checks for serious errors (e.g., hardware issues).
outFile.good() Checks if the stream is in a good state.

文章首发于:渗透测试安全攻防

原文始发于微信公众号(渗透测试安全攻防):浅谈Zstandard压缩算法

  • 左青龙
  • 微信扫一扫
  • weinxin
  • 右白虎
  • 微信扫一扫
  • weinxin
admin
  • 本文由 发表于 2024年9月17日16:09:58
  • 转载请保留本文链接(CN-SEC中文网:感谢原作者辛苦付出):
                   浅谈Zstandard压缩算法https://cn-sec.com/archives/3175178.html

发表评论

匿名网友 填写信息