创建: 2023-05-10 16:43
更新: 2023-05-12 11:29
https://scz.617.cn/python/202305101643.txt
☆ 背景介绍
☆ pycdc源码浅析
☆ 部分增加/修改的指令示例
1) CALL_FUNCTION_KW
2) CALL_FUNCTION_EX/DICT_MERGE
3) SETUP_FINALLY
4) JUMP_IF_NOT_EXC_MATCH
5) RERAISE
☆ 后记
☆ 背景介绍
pycdc是款C++开发的开源Python反编译器,对Python 3.9有部分支持:
Decompyle++ A Python Byte-code Disassembler/Decompiler
https://github.com/zrax/pycdc
有几次用到它,小打小闹修补过。我不会C++,本文属于备忘笔记,必有谬误,只可借鉴,不可全信。
☆ pycdc源码浅析
查看ASTree.cpp
/* Excerpt of decompyle() from ASTree.cpp; each `...` marks elided upstream code. */
void decompyle(PycRef<PycCode> code, PycModule* mod)
{
/*
* Build the decompiled result: BuildFromCode() turns the code object
* into an AST.
*/
PycRef<ASTNode> source = BuildFromCode(code, mod);
...
/*
* Emit the decompiled result by printing that AST.
*
* The root node handed over here is a NODE_NODELIST (1).
*/
print_src(source, mod);
...
}
/*
 * Excerpt of print_src() from ASTree.cpp; each `...` marks elided upstream
 * code (including some closing braces).  Only the NODE_BLOCK branch is
 * shown: it prints a block header such as "try:" or
 * "except KeyboardInterrupt:" and then the block body one level deeper.
 */
void print_src(PycRef<ASTNode> node, PycModule* mod)
{
    ...
    switch (node->type()) {
    case ASTNode::NODE_BLOCK:
        {
            ...
            /*
             * Emit the block keyword (try, except, ...).
             */
            fprintf(pyc_output, "%s", blk->type_str());
            ...
            } else if (blk->blktype() == ASTBlock::BLK_EXCEPT &&
                       blk.cast<ASTCondBlock>()->cond() != NULL) {
                fputs(" ", pyc_output);
                /*
                 * Emit the exception expression, i.e. the
                 * "KeyboardInterrupt" in "except KeyboardInterrupt".
                 */
                print_src(blk.cast<ASTCondBlock>()->cond(), mod);
            } else if (blk->blktype() == ASTBlock::BLK_WITH) {
                ...
            /*
             * Finish the header with a colon and a real newline escape,
             * then print the body with the indent level raised by one.
             */
            fputs(":\n", pyc_output);
            cur_indent++;
            print_block(blk, mod);
            cur_indent--;
        }
        break;
    ...
}
查看ASTNode.h
有两种重要数据类型,ASTNode、ASTBlock。ASTNode有如下类型
NODE_INVALID, NODE_NODELIST, NODE_OBJECT, NODE_UNARY, NODE_BINARY,
NODE_COMPARE, NODE_SLICE, NODE_STORE, NODE_RETURN, NODE_NAME,
NODE_DELETE, NODE_FUNCTION, NODE_CLASS, NODE_CALL, NODE_IMPORT,
NODE_TUPLE, NODE_LIST, NODE_SET, NODE_MAP, NODE_SUBSCR, NODE_PRINT,
NODE_CONVERT, NODE_KEYWORD, NODE_RAISE, NODE_EXEC, NODE_BLOCK,
NODE_COMPREHENSION, NODE_LOADBUILDCLASS, NODE_AWAITABLE,
NODE_FORMATTEDVALUE, NODE_JOINEDSTR, NODE_CONST_MAP,
NODE_ANNOTATED_VAR, NODE_CHAINSTORE, NODE_TERNARY
ASTBlock有如下类型
BLK_MAIN, BLK_IF, BLK_ELSE, BLK_ELIF, BLK_TRY,
BLK_CONTAINER, BLK_EXCEPT, BLK_FINALLY,
BLK_WHILE, BLK_FOR, BLK_WITH, BLK_ASYNCFOR
有多种ASTNode,反编译结果以ASTNode为基本单位进行组织。有些ASTNode简单,比如ASTReturn,有些ASTNode复杂,比如ASTBlock。ASTReturn、ASTBlock都是ASTNode的派生类。ASTBlock对应try块、except块这些。
RERAISE_test.py
# Minimal try/except sample; its Python 3.9 bytecode contains RERAISE.
def func () :
    try :
        x = 51201314
    except :
        pass
RERAISE_test.pycdump.asm
11 0 SETUP_FINALLY 8 (to 10)
12 2 LOAD_CONST 1 (51201314)
4 STORE_FAST 0 (x)
6 POP_BLOCK
8 JUMP_FORWARD 12 (to 22)
13 >> 10 POP_TOP
12 POP_TOP
14 POP_TOP
14 16 POP_EXCEPT
18 JUMP_FORWARD 2 (to 22)
20 RERAISE
>> 22 LOAD_CONST 0 (None)
24 RETURN_VALUE
组织反编译结果时,大致如此
NODE_NODELIST (1)
NODE_BLOCK (25) - BLK_CONTAINER (5) // func()
NODE_BLOCK (25) - BLK_TRY (4) // try
NODE_BLOCK (25) - BLK_EXCEPT (6) // except
NODE_RETURN (8) // return
最终的反编译输出
def func():
    try:
        x = 51201314
    except:
        pass
    return None
BuildFromCode()组织反编译结果,具体解析每条指令
/*
 * Excerpt of BuildFromCode() from ASTree.cpp; each `...` marks elided
 * upstream code.  The function walks the bytecode one instruction at a time
 * and dispatches on the opcode to build the AST that is returned as an
 * ASTNodeList.
 */
PycRef<ASTNode> BuildFromCode(PycRef<PycCode> code, PycModule* mod)
{
    ...
    while (!source.atEof()) {
        ...
        /*
         * Decode one instruction; pos is updated to point at the next
         * instruction to process.
         */
        bc_next(source, mod, opcode, operand, pos);
        ...
        switch (opcode) {
        ...
        case Pyc::SETUP_FINALLY_A:
            {
                ...
            }
            break;
        ...
        default:
            /*
             * Unknown opcode: report it, flag the build as not clean and
             * return whatever has been collected in defblock so far.
             */
            fprintf(stderr, "Unsupported opcode: %s\n", Pyc::OpcodeName(opcode & 0xFF));
            cleanBuild = false;
            return new ASTNodeList(defblock->nodes());
        }
        ...
    }

    /* Leftover stack snapshots are reported and then discarded. */
    if (stack_hist.size()) {
        fputs("Warning: Stack history is not empty!\n", stderr);
        while (stack_hist.size()) {
            stack_hist.pop();
        }
    }

    /*
     * More than one entry left on the block stack: report it, then fold
     * each leftover block into the block beneath it until one remains.
     */
    if (blocks.size() > 1) {
        fputs("Warning: block stack is not empty!\n", stderr);
        while (blocks.size() > 1) {
            PycRef<ASTBlock> tmp = blocks.top();
            blocks.pop();
            blocks.top()->append(tmp.cast<ASTNode>());
        }
    }

    cleanBuild = true;
    return new ASTNodeList(defblock->nodes());
}
"pycdc some.pyc"若输出"Unsupported opcode: XXX",表示XXX指令未被支持,该警告由BuildFromCode()发出。
"Unsupported opcode: XXX"中的"XXX"就是opname,假设是带参指令,"XXX"不包含后缀"_A",但switch/case中带参指令有"_A"后缀。
对"Unsupported opcode: XXX"的修补在BuildFromCode()的switch/case中进行。
BuildFromCode()内部维护名为blocks[]的栈,用于存放解析后组织出来的ASTBlock。
☆ 部分增加/修改的指令示例
pycdc未支持的Python 3.9指令不少;还有一些指令貌似支持,但实际对标更低版本Python,并不对标3.9;这些都需要处理,此间只记录部分。
1) CALL_FUNCTION_KW
在ASTree.cpp中搜
case Pyc::CALL_FUNCTION_A
case Pyc::CALL_FUNCTION_KW_A
原实现对标3.5及更早版本,与3.9不兼容,参看
https://docs.python.org/3.4/library/dis.html
https://docs.python.org/3.9/library/dis.html
"case Pyc::CALL_FUNCTION_A"事实上可以不修改:低版本的operand低8位是位置参数个数、高8位是关键字参数个数,而3.9的operand只表示位置参数个数;因为operand不会很大(通常小于256),按低版本方式解析时kwparams为0,测3.9也行。
"case Pyc::CALL_FUNCTION_KW_A"需要修改,下面是简单示例
case Pyc::CALL_FUNCTION_KW_A :
    {
        ASTCall::kwparam_t kwparamList;
        ASTCall::pparam_t pparamList;
        /*
         * The arguments of this instruction come from two places: the
         * operand and the stack.  The top of the stack is expected to be
         * the constant tuple of keyword names.
         *
         * top() followed by pop() is the equivalent of an assembly "pop
         * into register"; top() alone reads without removing.
         */
        PycRef<ASTNode> kw = stack.top();
        stack.pop();
        /*
         * See ASTNode.h for the NODE_* kinds.
         */
        if ( kw.type() != ASTNode::NODE_OBJECT )
        {
            fprintf( stderr, "Unsupported argument found for CALL_FUNCTION_KW\n" );
            break;
        }
        /*
         * ASTObject is one of the many ASTNode subclasses (see ASTNode.h);
         * it wraps a constant object.
         */
        PycRef<PycObject> obj = kw.cast<ASTObject>()->object();
        /*
         * Only TYPE_SMALL_TUPLE is accepted here; compare with the
         * disassembly of the sample to see where the tuple comes from.
         */
        if ( obj->type() != PycObject::TYPE_SMALL_TUPLE )
        {
            fprintf( stderr, "Unsupported argument type found for CALL_FUNCTION_KW\n" );
            break;
        }
        const auto& names = obj.cast<PycTuple>()->values();
        /*
         * From 3.6 on the keyword count is the length of the name tuple and
         * the operand is the total argument count, unlike older versions.
         */
        const int kwparams = static_cast<int>( names.size() );
        const int pparams = operand - kwparams;
        /*
         * Walk the names back to front while popping values, and prepend
         * each pair, so kwparamList ends up in the tuple's order.
         */
        for ( auto it = names.rbegin(); it != names.rend(); ++it )
        {
            PycRef<ASTNode> val = stack.top();
            stack.pop();
            PycRef<ASTNode> key = new ASTObject( *it );
            kwparamList.push_front( std::make_pair( key, val ) );
        }
        for ( int i = 0; i < pparams; i++ )
        {
            /*
             * the right-most positional parameter on top
             */
            pparamList.push_front( stack.top() );
            stack.pop();
        }
        /*
         * Below the arguments is a callable object to call.
         */
        PycRef<ASTNode> func = stack.top();
        stack.pop();
        PycRef<ASTNode> call = new ASTCall( func, pparamList, kwparamList );
        // call.cast<ASTCall>()->setKW( kw );
        /*
         * Nothing is printed yet: the call node is pushed back and gets
         * emitted later together with its enclosing ASTBlock.
         */
        stack.push( call );
    }
    break;
CALL_FUNCTION_KW_test.py
import dis

# Sample with one positional and one keyword argument.  func_4 is never
# defined: func() is only disassembled, not called.
def func () :
    func_4( 0, b=1 )

dis.dis( func )
func()的反汇编
11 0 LOAD_GLOBAL 0 (func_4)
2 LOAD_CONST 1 (0)
4 LOAD_CONST 2 (1)
6 LOAD_CONST 3 (('b',))
8 CALL_FUNCTION_KW 2
10 POP_TOP
12 LOAD_CONST 0 (None)
14 RETURN_VALUE
假设读者具有Python字节码功底,不多解释。
2) CALL_FUNCTION_EX/DICT_MERGE
CALL_FUNCTION_EX_test.py
import dis

# Sample using ** and * unpacking in calls.  func_6_0 / func_6_1 are never
# defined: func() is only disassembled, not called.
def func () :
    func_6_0( 0, **{"a":1, "b":2} )
    func_6_1( *[1, 2, 3] )

dis.dis( func )
11 0 LOAD_GLOBAL 0 (func_6_0)
2 LOAD_CONST 6 ((0,))
4 BUILD_MAP 0
6 LOAD_CONST 2 (1)
8 LOAD_CONST 3 (2)
10 LOAD_CONST 4 (('a', 'b'))
12 BUILD_CONST_KEY_MAP 2
14 DICT_MERGE 1
16 CALL_FUNCTION_EX 1
18 POP_TOP
12 20 LOAD_GLOBAL 1 (func_6_1)
22 BUILD_LIST 0
24 LOAD_CONST 5 ((1, 2, 3))
26 LIST_EXTEND 1
28 CALL_FUNCTION_EX 0
30 POP_TOP
32 LOAD_CONST 0 (None)
34 RETURN_VALUE
DICT_MERGE、CALL_FUNCTION_EX未被支持,需要增加
/*
 * DICT_MERGE(i) calls dict.update(TOS1[-i], TOS), raises an exception for
 * duplicate keys. Used to build dicts. New in version 3.9.
 */
case Pyc::DICT_MERGE_A :
    {
        /*
         * Simplification: only operand == 1 is handled for now.
         */
        if ( operand != 1 )
        {
            fprintf( stderr, "Unsupported operand found for DICT_MERGE\n" );
            break;
        }
        PycRef<ASTNode> dict = stack.top();
        stack.pop();
        if ( dict.type() != ASTNode::NODE_CONST_MAP )
        {
            fprintf( stderr, "Unsupported argument found for DICT_MERGE\n" );
            break;
        }
        /*
         * The real semantics are dict.update(TOS1[-i], TOS).  With i == 1
         * the merge target is the entry right below (the BUILD_MAP 0
         * result in the sample listing), so that entry is dropped and the
         * constant map is pushed back in its place.
         */
        stack.pop();
        stack.push( dict );
    }
    break;
case Pyc::CALL_FUNCTION_EX_A :
    {
        ASTCall::kwparam_t kwparamList;
        ASTCall::pparam_t pparamList;
        /*
         * Bit 0 of the operand says a ** mapping sits on top of the stack.
         */
        const bool hasKw = ( operand & 0x1 ) != 0;
        PycRef<ASTNode> kw;
        if ( hasKw )
        {
            kw = stack.top();
            stack.pop();
            if ( kw.type() != ASTNode::NODE_CONST_MAP )
            {
                fprintf( stderr, "Unsupported argument found for CALL_FUNCTION_EX\n" );
                break;
            }
        }
        /*
         * Next comes the positional part: either a constant tuple
         * (NODE_OBJECT, 2) or some other node such as NODE_LIST (16).
         */
        PycRef<ASTNode> var = stack.top();
        stack.pop();
        const bool constArgs = ( var.type() == ASTNode::NODE_OBJECT );
        if ( constArgs )
        {
            PycRef<PycObject> obj = var.cast<ASTObject>()->object();
            if ( obj->type() != PycObject::TYPE_SMALL_TUPLE )
            {
                fprintf( stderr, "Unsupported argument type found for CALL_FUNCTION_EX\n" );
                break;
            }
            const auto& items = obj.cast<PycTuple>()->values();
            for ( const auto& v : items )
            {
                /*
                 * Append in tuple order so the rebuilt call keeps the
                 * original argument order; prepending would reverse any
                 * tuple with more than one element.
                 */
                PycRef<ASTNode> val = new ASTObject( v );
                pparamList.push_back( val );
            }
        }
        PycRef<ASTNode> func = stack.top();
        stack.pop();
        PycRef<ASTNode> call = new ASTCall( func, pparamList, kwparamList );
        if ( !constArgs )
        {
            /*
             * Matches arguments of the form "*[1, 2, 3]".
             */
            call.cast<ASTCall>()->setVar( var );
        }
        if ( hasKw )
        {
            /*
             * Matches arguments of the form "**{"a":1, "b":2}".
             */
            call.cast<ASTCall>()->setKW( kw );
        }
        stack.push( call );
    }
    break;
3) SETUP_FINALLY
3.9没有SETUP_EXCEPT,全部揉进SETUP_FINALLY。在ASTree.cpp中合并这两个case
case Pyc::SETUP_EXCEPT_A
case Pyc::SETUP_FINALLY_A
只是应急方案,有一堆后遗症。
4) JUMP_IF_NOT_EXC_MATCH
case Pyc::JUMP_IF_NOT_EXC_MATCH_A :
    {
        /*
         * Two stack entries are consumed; the upper one is kept as the
         * candidate exception expression.
         */
        PycRef<ASTNode> excType = stack.top();
        stack.pop();
        stack.pop();
        /*
         * Incomplete, for illustration only: just the case where the
         * expression is a plain name and the innermost open block is an
         * except block is handled.  It produces the "KeyboardInterrupt"
         * in "except KeyboardInterrupt".
         */
        const bool isPlainName = ( excType->type() == ASTNode::NODE_NAME );
        if ( !isPlainName || blocks.empty() || blocks.top()->blktype() != ASTBlock::BLK_EXCEPT )
        {
            break;
        }
        /*
         * Swap the open except block for a new conditional block with the
         * same type, end and negative flag, now carrying the expression.
         */
        PycRef<ASTBlock> bare = blocks.top();
        blocks.pop();
        PycRef<ASTBlock> typed = new ASTCondBlock( bare->blktype(), bare->end(), excType, bare.cast<ASTCondBlock>()->negative() );
        typed->init();
        blocks.push( typed );
        curblock = blocks.top();
    }
    break;
pycdc需做大量模式匹配,基于经验结论组织反编译结果,JUMP_IF_NOT_EXC_MATCH的前导指令不一定是LOAD_NAME,上例只处理了这一种情形。
5) RERAISE
case Pyc::RERAISE :
    {
        // stack.pop();
        // stack.pop();
        // stack.pop();
        /*
         * Incomplete, for illustration only.
         *
         * Closing a block needs the block itself plus a parent to receive
         * it, so the size is checked before top() is touched; calling
         * top() on an empty stack is undefined behaviour.
         */
        if ( blocks.size() > 1 )
        {
            PycRef<ASTBlock> prev = blocks.top();
            /*
             * Only act when the last node of the open block is an except
             * block: pop the open block and append it to its parent.
             */
            if
            (
                prev->nodes().size() &&
                prev->nodes().back().type() == ASTNode::NODE_BLOCK &&
                prev->nodes().back().cast<ASTBlock>()->blktype() == ASTBlock::BLK_EXCEPT
            )
            {
                blocks.pop();
                blocks.top()->append( prev.cast<ASTNode>() );
                /*
                 * NOTE(review): curblock is not refreshed after this pop,
                 * unlike the JUMP_IF_NOT_EXC_MATCH handler; confirm
                 * whether that is intended.
                 */
            }
        }
    }
    break;
最初一上来就弹了三次栈(即上面代码中已被注释掉的三行stack.pop()),这个操作可能不对,毕竟这是反编译过程,而非执行过程,反编译时POP_EXCEPT就啥也没干,但实在懒得测试各种情形,将就对付吧。
☆ 后记
pycdc将各种Python版本的指令放在一起处理,这样干的坏处太多。比如某指令在几个Python版本之间发生变化,pycdc很可能未测试到此情形,但又不发出警告,最后反编译结果一团糟,很难定位root cause。pycdc已为向后兼容性所拖累,其代码可维护性越来越差,感觉作者只是在垂死挣扎。
开发Python反编译器,最大困难是CFG模式识别,纯体力活,需要精心准备各种测试用例,一不留神就覆盖不到,这得是靠爱发电的人干的事儿。
我没有大修过pycdc。
原文始发于微信公众号(青衣十三楼飞花堂):针对Python 3.9修改pycdc源码示例
- 左青龙
- 微信扫一扫
-
- 右白虎
- 微信扫一扫
-
评论