目录
- LZ77字典匹配
- Huffman 编码
- Zstandard
- reference
- 后记
LZ77字典匹配
算法步骤
-
设置编码位置为输入流的开始 -
在滑窗的待编码区查找搜索区中的最大匹配字符串 -
如果找到字符串,输出(偏移值, 匹配长度), 窗口向前滑动“匹配长度” -
如果没有找到,输出(0, 0, 待编码区的第一个字符),窗口向前滑动一个单位 -
如果待编码区不为空,回到步骤2
代码实现
#include <iostream>
#include <string>
#include <vector>
#include <utility>
class LZ77 {
public:
LZ77(const std::string& inputStr)
: inputStr(inputStr), searchSize(5), aheadSize(3), windSpiltIndex(0), move(0), notFind(-1) {}
void encoding() {
int step = 0;
std::cout << "Step Position Match Output" << std::endl;
while (!isLookHeadEmpty()) {
winMove();
std::pair<int, int> matchResult = findMaxMatch();
int offset = matchResult.first;
int matchLen = matchResult.second;
setMoveSteps(matchLen);
if (matchLen == 0) {
char nextChar = inputStr[windSpiltIndex];
std::cout << step << " " << windSpiltIndex << " - "
<< "(0,0," << nextChar <<")"<< std::endl;
}
else {
std::string matchStr = inputStr.substr(windSpiltIndex - offset, matchLen);
std::cout << step << " " << windSpiltIndex << " " << matchStr << " "
<< "(" << offset << ", " << matchLen << ")" << std::endl;
}
step++;
}
}
private:
std::string inputStr;
int searchSize;
int aheadSize;
int windSpiltIndex;
int move;
const int notFind;
int getWinEndIndex() const {
return windSpiltIndex + aheadSize;
}
int getWinStartIndex() const {
return windSpiltIndex - searchSize;
}
bool isLookHeadEmpty() const {
return windSpiltIndex + move > inputStr.size() - 1;
}
void winMove() {
windSpiltIndex += move;
}
std::pair<int, int> findMaxMatch() {
int matchLen = 0;
int offset = 0;
int minEdge = calculateMinEdge();
for (int i = windSpiltIndex + 1; i <=minEdge; i++) {
int offsetTemp = searchBufferOffset(i);
if (offsetTemp == notFind) {
return std::make_pair(offset, matchLen);
}
offset = offsetTemp;
matchLen++;
}
return std::make_pair(offset, matchLen);
}
int searchBufferOffset(int i) const {
int searchStart = getWinStartIndex();
int searchEnd = windSpiltIndex;
if (searchEnd < 1) {
return notFind;
}
if (searchStart < 0) {
searchStart = 0;
if (searchEnd == 0) {
searchEnd = 1;
}
}
std::string searchStr = inputStr.substr(searchStart, searchEnd - searchStart);
std::string searchFor = inputStr.substr(windSpiltIndex, i - windSpiltIndex);
size_t findIndex = searchStr.find(searchFor);
if (findIndex == std::string::npos) {
return notFind;
}
return searchStr.size() - findIndex;
}
void setMoveSteps(int matchLen) {
//move = (matchLen == 0) ? 1 : matchLen;
move = std::max(1, matchLen);
}
int calculateMinEdge() const {
return std::min(int (inputStr.size()), getWinEndIndex());
}
};
int main() {
LZ77 lz77("AABCBBABCAB");
lz77.encoding();
return 0;
}
Huffman 编码
#include <iostream>
#include <vector>
#include <queue>
#include <unordered_map>
#include <fstream>
#include <bitset>
// Node structure for the Huffman Tree
struct Node {
char ch;
int freq;
Node* left;
Node* right;
Node(char c, int f) : ch(c), freq(f), left(nullptr), right(nullptr) {}
};
// Compare structure for the priority queue
struct Compare {
bool operator()(Node* left, Node* right) {
return left->freq > right->freq;//定义最小堆
}
};
// Build the Huffman Tree
Node* buildHuffmanTree(const std::unordered_map<char, int>& freqMap) {
std::priority_queue<Node*, std::vector<Node*>, Compare> pq;
// Create a leaf node for each character and add it to the priority queue
for (auto pair : freqMap) {
pq.push(new Node(pair.first, pair.second));
}
// Iterate until the queue contains only one node (the root)
while (pq.size() > 1) {
Node* left = pq.top(); pq.pop();
Node* right = pq.top(); pq.pop();
// Create a new internal node with frequency equal to the sum of the two nodes
Node* internalNode = new Node('�', left->freq + right->freq);
internalNode->left = left;
internalNode->right = right;
pq.push(internalNode);
}
// The remaining node is the root of the Huffman Tree
return pq.top();
}
// Traverse the Huffman Tree and store codes in a map
void generateCodes(Node* root, std::unordered_map<char, std::string>& huffmanCode, std::string code) {
if (!root) return;
// If this is a leaf node, store the character and its code
if (root->ch != '�') {
huffmanCode[root->ch] = code;
}
// Traverse the left and right children
generateCodes(root->left, huffmanCode, code + "0");
generateCodes(root->right, huffmanCode, code + "1");
}
// Encode the input string using the Huffman codes
std::string encode(const std::string& text, const std::unordered_map<char, std::string>& huffmanCode) {
std::string encodedString;
for (char ch : text) {
encodedString += huffmanCode.at(ch);
}
return encodedString;
}
// Decode the encoded string using the Huffman Tree
std::string decode(const std::string& encodedString, Node* root) {
std::string decodedString;
Node* currentNode = root;
for (char bit : encodedString) {
if (bit == '0') {
currentNode = currentNode->left;
}
else {
currentNode = currentNode->right;
}
// If we reach a leaf node, append the character to the decoded string
if (currentNode->left == nullptr && currentNode->right == nullptr) {
decodedString += currentNode->ch;
currentNode = root;
}
}
return decodedString;
}
void freeTree(Node* root) {
if (root == nullptr) {
return;
}
freeTree(root->left);
freeTree(root->right);
delete root;
}
void writeBitStreamToFile(const std::string& bitStream, const std::string& filename) {
std::ofstream outFile(filename, std::ios::binary);
char buffer = 0;
int bitCount = 0;
for (char bit : bitStream) {
buffer <<= 1;
if (bit == '1') {
buffer |= 1;
}
bitCount++;
// 当收集到 8 位时,将字节写入文件
if (bitCount == 8) {
outFile.put(buffer);
buffer = 0;
bitCount = 0;
}
}
// 如果最后不满8位,补齐0再写入
if (bitCount > 0) {
buffer <<= (8 - bitCount); // 将剩余位数补齐到8位
outFile.put(buffer);
}
outFile.close();
}
std::string decodeFromFile(const std::string& filename,std::unordered_map<std::string, char> reverseHuffmanCode)
{
std::ifstream inFile(filename, std::ios::binary);
if (!inFile.is_open()) {
std::cerr << "Error: Cannot open file!" << std::endl;
return "";
}
std::string bitStream = "";
char buffer;
while (inFile.get(buffer)){
std::bitset bites= std::bitset<8>(buffer);
bitStream += bites.to_string();
// 读取每个字节,并将其转换为位串
}
inFile.close();
std::string decodedText = "";
std::string currentBits = "";
for (char bit : bitStream) {
currentBits += bit;
// 一旦找到编码匹配字符,解码该字符
if (reverseHuffmanCode.find(currentBits) != reverseHuffmanCode.end()) {
decodedText += reverseHuffmanCode[currentBits];
currentBits = ""; // 重置
}
}
return decodedText;
}
int main() {
std::string text = "Huffman coding is a data compression algorithm";
// Step 1: Frequency Count
std::unordered_map<char, int> freqMap;
for (char ch : text) {
freqMap[ch]++;
}
// Step 2: Build the Huffman Tree
Node* root = buildHuffmanTree(freqMap);
// Step 3: Generate Huffman Codes
std::unordered_map<char, std::string> huffmanCode;
generateCodes(root, huffmanCode, "");
// Display the Huffman codes
std::cout << "Huffman Codes:n";
for (auto pair : huffmanCode) {
std::cout << pair.first << ": " << pair.second << 'n';
}
// Step 4: Encode the input string
std::string encodedString = encode(text, huffmanCode);
std::cout << "nEncoded String: " << encodedString << 'n';
writeBitStreamToFile(encodedString, "out.bin");
// Step 5 (Optional): Decode the encoded string
std::unordered_map<std::string, char> rehaffman;
for (auto it : huffmanCode) {
rehaffman.insert(std::make_pair(it.second, it.first));
}
std::string newstring = decodeFromFile("out.bin", rehaffman);
//std::string decodedString = decode(encodedString, root);
std::cout << "nDecoded String: " << newstring << "n";
return 0;
}
Huffman Codes: a: 011 i: 000 g: 0010 m: 1000 e: 00110 l: 00111 s: 0101 c: 0100 n: 1001 : 101 o: 1100 f: 11010 t: 11011 H: 111000 p: 111001 r: 11101 d: 11110 h: 111110 u: 111111
Encoded String: 1110001111111101011010100001110011010100110011110000100100101010000101101011101111100111101101110101001100100011100111101001100101010100011001001101011001110010110011101000110111111101000
Decoded String: Huffman coding is a data compression algorithmi
E3 FD 6A 1C D4 CF 09 2A 16 BB E7 B7 53 23 9E 99 54 64 D6 72 CE 8D FD 00
map/unordered_map
-
Underlying Data Structure:
-
map
: Implemented as a self-balancing binary search tree (typically a Red-Black tree). This means that the elements are stored in a sorted order based on the keys. -
unordered_map
: Implemented as a hash table. Elements are stored in no particular order, but operations (insertion, lookup) are generally faster.
-
Time Complexity:
-
map
: All operations (insertion, deletion, search) take O(logn) time due to the tree structure. -
unordered_map
: Average time complexity for most operations is O(1) because of the hash table. However, in the worst case (due to hash collisions), it can degrade to O(n).
-
Order of Elements:
-
map
: Elements are stored in sorted order based on their keys, which can be useful when you need sorted data. -
unordered_map
: There is no guaranteed order of the elements.
-
Memory Usage:
-
map
: Uses more memory because of the tree structure overhead (pointers to left and right child nodes). -
unordered_map
: Uses extra memory for maintaining the hash table and handling hash collisions (e.g., chaining or open addressing).
-
When to Use map
:
-
When you need the keys to be sorted (e.g., iterating over keys in order). -
When the cost of log-time complexity is acceptable, and you need stable performance. -
When insertion order or lookup order matters.
-
When to Use unordered_map
:
-
When you don't need the keys to be sorted. -
When you prioritize faster access (constant time average performance). -
When you are working with large datasets and need efficient lookups.
find/[]/at
-
find:判断find的返回结果,才知道有没有 -
[]:不管有没有,都是有。没有就是0,字符串就是空(自动创建) -
at:如果存在,则返回它的值,如果不存在,则抛出异常
Zstandard
api
/*! ZSTD_compress() :
* Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
* Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
* @return : compressed size written into `dst` (<= `dstCapacity),
* or an error code if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
int compressionLevel);
/*! ZSTD_decompress() :
* `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
* `dstCapacity` is an upper bound of originalSize.
* If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
* @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
* or an errorCode if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
const void* src, size_t compressedSize);
ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
/*!< maximum compressed size in worst case scenario */
ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize);
压缩
#include<iostream>
#include<windows.h>
#include<zstd.h>
int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;
const char* path = "D:\ant me\Ant Messenger\Ant Messenger.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
dwFileSize = GetFileSize(hFile, NULL);
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);
// 计算压缩后的最大缓冲区大小
size_t compressed_size_bound = ZSTD_compressBound(dwFileSize);
char* compressed_data = new char[compressed_size_bound];
// 压缩数据
size_t compressed_size = ZSTD_compress(compressed_data, compressed_size_bound, input, dwFileSize, 1);
if (ZSTD_isError(compressed_size)) {
std::cerr << "Compression failed: " << ZSTD_getErrorName(compressed_size) << std::endl;
delete[] compressed_data;
return 1;
}
std::cout << "Original size: " << dwFileSize << std::endl;
std::cout << "Compressed size: " << compressed_size << std::endl;
const char* path2 = "D:\ant me\Ant Messenger\coleak.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0, NULL);
WriteFile(hFile2, compressed_data, compressed_size, &FileReadSize, NULL);
delete[] compressed_data;
return 0;
}
ZSTD_compressBound->ZSTD_compress
解压缩
#include <iostream>
#include <windows.h>
#include <zstd.h>
int main() {
IN HANDLE hFile;
OUT DWORD FileReadSize;
IN DWORD dwFileSize;
IN PVOID RemoteImageBase;
IN PVOID input;
// 打开压缩文件
const char* path = "D:\ant me\Ant Messenger\coleak.exe";
hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
dwFileSize = GetFileSize(hFile, NULL); // 获取压缩文件的大小
input = VirtualAlloc(NULL, dwFileSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
ReadFile(hFile, input, dwFileSize, &FileReadSize, NULL);
CloseHandle(hFile);
// 先计算解压后所需的最大缓冲区大小
unsigned long long decompressed_size = ZSTD_getFrameContentSize(input, dwFileSize);
if (decompressed_size == ZSTD_CONTENTSIZE_ERROR) {
std::cerr << "Not compressed by zstd!" << std::endl;
return 1;
}
else if (decompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) {
std::cerr << "Original size is unknown!" << std::endl;
return 1;
}
// 分配解压后的数据缓冲区
char* decompressed_data = new char[decompressed_size];
// 解压数据
size_t result = ZSTD_decompress(decompressed_data, decompressed_size, input, dwFileSize);
if (ZSTD_isError(result)) {
std::cerr << "Decompression failed: " << ZSTD_getErrorName(result) << std::endl;
delete[] decompressed_data;
return 1;
}
std::cout << "Decompression successful! Original size: " << decompressed_size << std::endl;
const char* path2 = "D:\ant me\Ant Messenger\coleak2.exe";
HANDLE hFile2 = CreateFileA(path2, FILE_ALL_ACCESS, FILE_SHARE_READ, NULL, CREATE_ALWAYS, 0, NULL);
WriteFile(hFile2, decompressed_data, decompressed_size, &FileReadSize, NULL);
delete[] decompressed_data;
return 0;
}
ZSTD_getFrameContentSize->ZSTD_decompress
压缩等级
它的压缩级别从负5级(最快)到22级(压缩速度最慢,但是压缩比最高)可以调节
上面的介绍来自维基百科,但是实测压缩等级可以设置在区间外,但是相差越远越不明显
-5:压缩速度极快,压缩比低,占用较少的内存
22:非常慢的压缩速度。 最佳压缩比,最大限度减少文件大小,适合数据冷存储。
随着级别的增加,压缩速度减慢,但压缩比增高,内存占用也随着压缩级别的增加而增加。
reference
https://tech.meituan.com/2021/01/07/pack-gzip-zstd-lz4.html
https://learn.microsoft.com/zh-cn/windows/win32/cmpapi/using-the-compression-api
https://zh.wikipedia.org/wiki/Zstandard
https://xwxz.github.io/algorithm-LZ77/
后记
ifstream
Open Modes
std::ios::in
: Default mode for ifstream
, which opens the file for input.
std::ios::binary
: Open in binary mode (useful for non-text files).
std::ios::ate
: Move the file pointer to the end of the file after opening.
std::ios::app
: Append mode.
std::ios::trunc
: If the file exists, its contents are truncated (deleted).
#include <iostream>
#include <fstream>
#include <string>
int main() {
std::ifstream inputFile("a.txt");
if (!inputFile.is_open()) {
std::cerr << "Failed to open file!" << std::endl;
return 1;
}
std::string line;
//while (std::getline(inputFile, line)) {
// std::cout << line << std::endl;
//}
char ch;
while (inputFile.get(ch))
{
std::cout << ch;
}
if (inputFile.eof()) {
std::cout << "End of file reached." << std::endl;
}
else if (inputFile.fail()) {
std::cerr << "Error while reading file!" << std::endl;
}
inputFile.close();
return 0;
}
ofstream
Open Modes
std::ios::out
(default for ofstream
): Opens the file for writing.
std::ios::app
: Opens the file for appending. Data is written to the end of the file.
std::ios::trunc
: If the file exists, it is truncated to zero length (default).
std::ios::binary
: Opens the file in binary mode (for binary files).
std::ios::ate
: Opens the file and moves the file pointer to the end of the file.
#include <fstream>
int main() {
// Opening a file in append mode
std::ofstream outFile("log.txt", std::ios::app);
outFile << "Appending some text.n";
outFile.flush(); // Data is written to the file without closing it
outFile.put('H'); // Write 'H' to the file
outFile.put('e'); // Write 'e' to the file
outFile.put('l'); // Write 'l' to the file
outFile.put('l'); // Write 'l' to the file
outFile.put('o'); // Write 'o' to the file
outFile.put('!'); // Write '!' to the file
outFile.close();
// Opening a file in binary mode
std::ofstream binFile("data.bin", std::ios::binary);
const char c[10] = { 0x10,0x48,0x5f };
binFile.write(c, 10); // Writing raw bytes
binFile.close();
return 0;
}
函数表
Function | Description |
---|---|
ofstream outFile |
Constructor to create and open a file for writing. |
outFile.is_open() |
Checks if the file was opened successfully. |
outFile << "text" |
Writes text or data to the file. |
outFile.close() |
Closes the file. |
outFile.flush() |
Flushes the buffer to the file. |
outFile.write() |
Writes binary data to the file. |
outFile.fail() |
Checks if a write operation failed. |
outFile.bad() |
Checks for serious errors (e.g., hardware issues). |
outFile.good() |
Checks if the stream is in a good state. |
文章首发于:渗透测试安全攻防
原文始发于微信公众号(渗透测试安全攻防):浅谈Zstandard压缩算法
- 左青龙
- 微信扫一扫
- 右白虎
- 微信扫一扫
评论