浅谈Zstandard压缩算法

admin

145365
文章

119
评论

2024年9月17日16:09:58评论39 views字数 15916阅读53分3秒阅读模式

LZ77字典匹配
Huffman 编码
Zstandard
reference
后记

LZ77字典匹配

算法步骤

设置编码位置为输入流的开始
在滑窗的待编码区查找搜索区中的最大匹配字符串
如果找到字符串，输出(偏移值，匹配长度)，窗口向前滑动“匹配长度”
如果没有找到，输出(0, 0, 待编码区的第一个字符)，窗口向前滑动一个单位
如果待编码区不为空，回到步骤2

代码实现

#include <iostream>
#include <string>
#include <vector>
#include <utility>

class LZ77 {
public:
LZ77(const std::string& inputStr)
: inputStr(inputStr), searchSize(5), aheadSize(3), windSpiltIndex(0), move(0), notFind(-1) {}

void encoding() {
int step = 0;
std::cout << "Step Position Match Output" << std::endl;
while (!isLookHeadEmpty()) {
winMove();
std::pair<int, int> matchResult = findMaxMatch();
int offset = matchResult.first;
int matchLen = matchResult.second;
setMoveSteps(matchLen);

if (matchLen == 0) {
char nextChar = inputStr[windSpiltIndex];
std::cout << step << " " << windSpiltIndex << " - "
<< "(0,0," << nextChar <<")"<< std::endl;
}
else {
std::string matchStr = inputStr.substr(windSpiltIndex - offset, matchLen);
std::cout << step << " " << windSpiltIndex << " " << matchStr << " "
<< "(" << offset << ", " << matchLen << ")" << std::endl;
}
step++;
}
}

private:
std::string inputStr;
int searchSize;
int aheadSize;
int windSpiltIndex;
int move;
const int notFind;

int getWinEndIndex() const {
return windSpiltIndex + aheadSize;
}

int getWinStartIndex() const {
return windSpiltIndex - searchSize;
}

bool isLookHeadEmpty() const {
return windSpiltIndex + move > inputStr.size() - 1;
}

void winMove() {
windSpiltIndex += move;
}

std::pair<int, int> findMaxMatch() {
int matchLen = 0;
int offset = 0;
int minEdge = calculateMinEdge();
for (int i = windSpiltIndex + 1; i <=minEdge; i++) {
int offsetTemp = searchBufferOffset(i);
if (offsetTemp == notFind) {
return std::make_pair(offset, matchLen);
}
offset = offsetTemp;
matchLen++;
}
return std::make_pair(offset, matchLen);
}

int searchBufferOffset(int i) const {
int searchStart = getWinStartIndex();
int searchEnd = windSpiltIndex;
if (searchEnd < 1) {
return notFind;
}
if (searchStart < 0) {
searchStart = 0;
if (searchEnd == 0) {
searchEnd = 1;
}
}
std::string searchStr = inputStr.substr(searchStart, searchEnd - searchStart);
std::string searchFor = inputStr.substr(windSpiltIndex, i - windSpiltIndex);
size_t findIndex = searchStr.find(searchFor);
if (findIndex == std::string::npos) {
return notFind;
}
return searchStr.size() - findIndex;
}

void setMoveSteps(int matchLen) {
//move = (matchLen == 0) ? 1 : matchLen;
move = std::max(1, matchLen);

}

int calculateMinEdge() const {
return std::min(int (inputStr.size()), getWinEndIndex());
}
};

int main() {
LZ77 lz77("AABCBBABCAB");
lz77.encoding();
return 0;
}

浅谈Zstandard压缩算法

Huffman 编码

#include <iostream>
#include <vector>
#include <queue>
#include <unordered_map>
#include <fstream>
#include <bitset>

// Node structure for the Huffman Tree
struct Node {
char ch;
int freq;
Node* left;
Node* right;

Node(char c, int f) : ch(c), freq(f), left(nullptr), right(nullptr) {}
};

// Compare structure for the priority queue
struct Compare {
bool operator()(Node* left, Node* right) {
return left->freq > right->freq;//定义最小堆
}
};

// Build the Huffman Tree
Node* buildHuffmanTree(const std::unordered_map<char, int>& freqMap) {
std::priority_queue<Node*, std::vector<Node*>, Compare> pq;

// Create a leaf node for each character and add it to the priority queue
for (auto pair : freqMap) {
pq.push(new Node(pair.first, pair.second));
}

// Iterate until the queue contains only one node (the root)
while (pq.size() > 1) {
Node* left = pq.top(); pq.pop();
Node* right = pq.top(); pq.pop();

// Create a new internal node with frequency equal to the sum of the two nodes
Node* internalNode = new Node('�', left->freq + right->freq);
internalNode->left = left;
internalNode->right = right;

pq.push(internalNode);
}

// The remaining node is the root of the Huffman Tree
return pq.top();
}

// Traverse the Huffman Tree and store codes in a map
void generateCodes(Node* root, std::unordered_map<char, std::string>& huffmanCode, std::string code) {
if (!root) return;

// If this is a leaf node, store the character and its code
if (root->ch != '�') {
huffmanCode[root->ch] = code;
}

// Traverse the left and right children
generateCodes(root->left, huffmanCode, code + "0");
generateCodes(root->right, huffmanCode, code + "1");
}

// Encode the input string using the Huffman codes
std::string encode(const std::string& text, const std::unordered_map<char, std::string>& huffmanCode) {
std::string encodedString;
for (char ch : text) {
encodedString += huffmanCode.at(ch);
}
return encodedString;
}

// Decode the encoded string using the Huffman Tree
std::string decode(const std::string& encodedString, Node* root) {
std::string decodedString;
Node* currentNode = root;

for (char bit : encodedString) {
if (bit == '0') {
currentNode = currentNode->left;
}
else {
currentNode = currentNode->right;
}

// If we reach a leaf node, append the character to the decoded string
if (currentNode->left == nullptr && currentNode->right == nullptr) {
decodedString += currentNode->ch;
currentNode = root;
}
}

return decodedString;
}
void freeTree(Node* root) {
if (root == nullptr) {
return;
}
freeTree(root->left);
freeTree(root->right);
delete root;
}
void writeBitStreamToFile(const std::string& bitStream, const std::string& filename) {
std::ofstream outFile(filename, std::ios::binary);
char buffer = 0;
int bitCount = 0;

for (char bit : bitStream) {
buffer <<= 1;
if (bit == '1') {
buffer |= 1;
}
bitCount++;

// 当收集到 8 位时，将字节写入文件
if (bitCount == 8) {
outFile.put(buffer);
buffer = 0;
bitCount = 0;
}
}

// 如果最后不满8位，补齐0再写入
if (bitCount > 0) {
buffer <<= (8 - bitCount); // 将剩余位数补齐到8位
outFile.put(buffer);
}

outFile.close();
}
std::string decodeFromFile(const std::string& filename,std::unordered_map<std::string, char> reverseHuffmanCode)
{
std::ifstream inFile(filename, std::ios::binary);
if (!inFile.is_open()) {
std::cerr << "Error: Cannot open file!" << std::endl;
return "";
}

std::string bitStream = "";
char buffer;
while (inFile.get(buffer)){
std::bitset bites= std::bitset<8>(buffer);
bitStream += bites.to_string();
// 读取每个字节，并将其转换为位串
}

inFile.close();

std::string decodedText = "";
std::string currentBits = "";
for (char bit : bitStream) {
currentBits += bit;

// 一旦找到编码匹配字符，解码该字符
if (reverseHuffmanCode.find(currentBits) != reverseHuffmanCode.end()) {
decodedText += reverseHuffmanCode[currentBits];
currentBits = ""; // 重置
}
}

return decodedText;
}
int main() {
std::string text = "Huffman coding is a data compression algorithm";

// Step 1: Frequency Count
std::unordered_map<char, int> freqMap;
for (char ch : text) {
freqMap[ch]++;
}

// Step 2: Build the Huffman Tree
Node* root = buildHuffmanTree(freqMap);

// Step 3: Generate Huffman Codes
std::unordered_map<char, std::string> huffmanCode;
generateCodes(root, huffmanCode, "");

// Display the Huffman codes
std::cout << "Huffman Codes:n";
for (auto pair : huffmanCode) {
std::cout << pair.first << ": " << pair.second << 'n';
}

// Step 4: Encode the input string
std::string encodedString = encode(text, huffmanCode);
std::cout << "nEncoded String: " << encodedString << 'n';
writeBitStreamToFile(encodedString, "out.bin");

// Step 5 (Optional): Decode the encoded string
std::unordered_map<std::string, char> rehaffman;
for (auto it : huffmanCode) {
rehaffman.insert(std::make_pair(it.second, it.first));
}
std::string newstring = decodeFromFile("out.bin", rehaffman);

//std::string decodedString = decode(encodedString, root);
std::cout << "nDecoded String: " << newstring << "n";

return 0;
}

Huffman Codes: a: 011 i: 000 g: 0010 m: 1000 e: 00110 l: 00111 s: 0101 c: 0100 n: 1001 : 101 o: 1100 f: 11010 t: 11011 H: 111000 p: 111001 r: 11101 d: 11110 h: 111110 u: 111111

Encoded String: 1110001111111101011010100001110011010100110011110000100100101010000101101011101111100111101101110101001100100011100111101001100101010100011001001101011001110010110011101000110111111101000

Decoded String: Huffman coding is a data compression algorithmi

E3 FD 6A 1C D4 CF 09 2A 16 BB E7 B7 53 23 9E 99 54 64 D6 72 CE 8D FD 00

map/unordered_map

Underlying Data Structure:

map: Implemented as a self-balancing binary search tree (typically a Red-Black tree). This means that the elements are stored in a sorted order based on the keys.
unordered_map: Implemented as a hash table. Elements are stored in no particular order, but operations (insertion, lookup) are generally faster.

Time Complexity:

map: All operations (insertion, deletion, search) take O(log⁡n) time due to the tree structure.
unordered_map: Average time complexity for most operations is O(1) because of the hash table. However, in the worst case (due to hash collisions), it can degrade to O(n).

Order of Elements:

map: Elements are stored in sorted order based on their keys, which can be useful when you need sorted data.
unordered_map: There is no guaranteed order of the elements.

Memory Usage:

map: Uses more memory because of the tree structure overhead (pointers to left and right child nodes).
unordered_map: Uses extra memory for maintaining the hash table and handling hash collisions (e.g., chaining or open addressing).

When to Use map:

When you need the keys to be sorted (e.g., iterating over keys in order).
When the cost of log-time complexity is acceptable, and you need stable performance.
When insertion order or lookup order matters.

When to Use unordered_map:

When you don't need the keys to be sorted.
When you prioritize faster access (constant time average performance).
When you are working with large datasets and need efficient lookups.

find/[]/at

find:判断find的返回结果，才知道有没有
[]:不管有没有，都是有。没有就是0，字符串就是空（自动创建）
at:如果存在，则返回它的值，如果不存在，则抛出异常

Zstandard

api

/*! ZSTD_compress() :
 *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
 *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
 *  @return : compressed size written into `dst` (<= `dstCapacity),
 *            or an error code if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
                            const void* src, size_t srcSize,
                                  int compressionLevel);

/*! ZSTD_decompress() :
*  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
*  `dstCapacity` is an upper bound of originalSize.
*  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
*  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
*            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
const void* src, size_t compressedSize);

ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);

/*!< maximum compressed size in worst case scenario */
ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize);

压缩

#include<iostream>
#include<windows.h>
#include<zstd.h>

int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;
const char* path = "D:\ant me\Ant Messenger\Ant Messenger.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
dwFileSize = GetFileSize(hFile, NULL);
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);

// 计算压缩后的最大缓冲区大小
size_t compressed_size_bound = ZSTD_compressBound(dwFileSize);
char* compressed_data = new char[compressed_size_bound];

// 压缩数据
size_t compressed_size = ZSTD_compress(compressed_data, compressed_size_bound, input, dwFileSize, 1);
if (ZSTD_isError(compressed_size)) {
std::cerr << "Compression failed: " << ZSTD_getErrorName(compressed_size) << std::endl;
delete[] compressed_data;
return 1;
}

std::cout << "Original size: " << dwFileSize << std::endl;
std::cout << "Compressed size: " << compressed_size << std::endl;
const char* path2 = "D:\ant me\Ant Messenger\coleak.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0, NULL);
WriteFile(hFile2, compressed_data, compressed_size, &FileReadSize, NULL);

delete[] compressed_data;
return 0;
}

ZSTD_compressBound->ZSTD_compress

解压缩

#include <iostream>
#include <windows.h>
#include <zstd.h>

int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;

// 打开压缩文件
const char* path = "D:\ant me\Ant Messenger\coleak.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
dwFileSize = GetFileSize(hFile, NULL); // 获取压缩文件的大小
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);

// 先计算解压后所需的最大缓冲区大小
unsigned long long decompressed_size = ZSTD_getFrameContentSize(input, dwFileSize);
if (decompressed_size == ZSTD_CONTENTSIZE_ERROR) {
std::cerr << "Not compressed by zstd!" << std::endl;
return 1;
}
else if (decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) {
std::cerr << "Original size is unknown!" << std::endl;
return 1;
}

// 分配解压后的数据缓冲区
char* decompressed_data = new char[decompressed_size];

// 解压数据
size_t result = ZSTD_decompress(decompressed_data, decompressed_size, input, dwFileSize);
if (ZSTD_isError(result)) {
std::cerr << "Decompression failed: " << ZSTD_getErrorName(result) << std::endl;
delete[] decompressed_data;
return 1;
}

std::cout << "Decompression successful! Original size: " << decompressed_size << std::endl;

const char* path2 = "D:\ant me\Ant Messenger\coleak2.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0, NULL);
WriteFile(hFile2, decompressed_data, decompressed_size, &FileReadSize, NULL);
delete[] decompressed_data;
return 0;
}

ZSTD_getFrameContentSize->ZSTD_decompress

压缩等级

它的压缩级别从负5级（最快）到22级（压缩速度最慢，但是压缩比最高）可以调节

上面的介绍来自维基百科，但是实测压缩等级可以设置在区间外，但是相差越远越不明显

-5：压缩速度极快，压缩比低，占用较少的内存
22：非常慢的压缩速度。 最佳压缩比，最大限度减少文件大小，适合数据冷存储。
随着级别的增加，压缩速度减慢，但压缩比增高，内存占用也随着压缩级别的增加而增加。

reference

https://tech.meituan.com/2021/01/07/pack-gzip-zstd-lz4.html
https://learn.microsoft.com/zh-cn/windows/win32/cmpapi/using-the-compression-api
https://zh.wikipedia.org/wiki/Zstandard
https://xwxz.github.io/algorithm-LZ77/

后记

ifstream

Open Modes

std::ios::in: Default mode for ifstream, which opens the file for input.

std::ios::binary: Open in binary mode (useful for non-text files).

std::ios::ate: Move the file pointer to the end of the file after opening.

std::ios::app: Append mode.

std::ios::trunc: If the file exists, its contents are truncated (deleted).

#include <iostream>
#include <fstream>
#include <string>

int main() {
std::ifstream inputFile("a.txt");

if (!inputFile.is_open()) {
std::cerr << "Failed to open file!" << std::endl;
return 1;
}

std::string line;
//while (std::getline(inputFile, line)) {
// std::cout << line << std::endl;
//}
char ch;
while (inputFile.get(ch))
{
std::cout << ch;
}
if (inputFile.eof()) {
std::cout << "End of file reached." << std::endl;
}

else if (inputFile.fail()) {
std::cerr << "Error while reading file!" << std::endl;
}

inputFile.close();
return 0;
}

ofstream

Open Modes

std::ios::out (default for ofstream): Opens the file for writing.

std::ios::app: Opens the file for appending. Data is written to the end of the file.

std::ios::trunc: If the file exists, it is truncated to zero length (default).

std::ios::binary: Opens the file in binary mode (for binary files).

std::ios::ate: Opens the file and moves the file pointer to the end of the file.

#include <fstream>

int main() {
// Opening a file in append mode
std::ofstream outFile("log.txt", std::ios::app);
outFile << "Appending some text.n";
outFile.flush(); // Data is written to the file without closing it
outFile.put('H');   // Write 'H' to the file
outFile.put('e');   // Write 'e' to the file
outFile.put('l');   // Write 'l' to the file
outFile.put('l');   // Write 'l' to the file
outFile.put('o');   // Write 'o' to the file
outFile.put('!');   // Write '!' to the file
outFile.close();

// Opening a file in binary mode
std::ofstream binFile("data.bin", std::ios::binary);
const char c[10] = { 0x10,0x48,0x5f };
binFile.write(c, 10); // Writing raw bytes
binFile.close();

return 0;
}

函数表

Function	Description
`ofstream outFile`	Constructor to create and open a file for writing.
`outFile.is_open()`	Checks if the file was opened successfully.
`outFile << "text"`	Writes text or data to the file.
`outFile.close()`	Closes the file.
`outFile.flush()`	Flushes the buffer to the file.
`outFile.write()`	Writes binary data to the file.
`outFile.fail()`	Checks if a write operation failed.
`outFile.bad()`	Checks for serious errors (e.g., hardware issues).
`outFile.good()`	Checks if the stream is in a good state.

文章首发于：渗透测试安全攻防

原文始发于微信公众号（渗透测试安全攻防）：浅谈Zstandard压缩算法

免责声明:文章中涉及的程序(方法)可能带有攻击性，仅供安全研究与教学之用，读者将其信息做其他用途，由读者承担全部法律及连带责任，本站不承担任何法律及连带责任；如有问题可邮件联系(建议使用企业邮箱或有效邮箱,避免邮件被拦截，联系方式见首页)，望知悉。

左青龙
微信扫一扫

右白虎
微信扫一扫

浅谈Zstandard压缩算法

LZ77字典匹配

Huffman 编码

Zstandard

reference

后记

web基础知识

网络安全行业术语，您了解多少

渗透测试 | Sign 签名是什么

防御方安全防护体系概述

物理机、虚拟机、裸金属和容器介绍

什么是HTTP慢速攻击？看不见的温水煮青蛙，如何瘫痪你的服务器？

AES128加密算法

密码学初窥

公钥基础设施（PKI）介绍

100 个 Web 应用漏洞类型挖掘

发表评论