2009年5月2日星期六

蛋痛产物:用Python读erlang字节码

连着断网两天啊,在宿舍闲着(NND管机房的老师都出去旅游了,结果看着机房里的老鼠在那里咬光纤就是进不去……生气ing)
闲着蛋痛(Workspace整个都在服务器上,光纤一断什么都干不了),于是把很久以前的工作干完了,于是有了下面这个玩意:
有点类似javap的感觉的一个玩意儿,就是纯粹的读取beam文件然后打印结果,主要是为了熟悉beam字节码的结构,说起来这玩意可真要命,几乎找不到任何文档说明,只能去看源代码,所以放寒假的时候曾经研究过一段时间,但是最后放弃了OTL。这几天是实在蛋痛才拿出来鼓捣。而且到最后关于字节码的行为和定义还是没弄懂(erts在加载字节码时还要变形……而且要命的是这玩意儿是基于寄存器的而不是基于堆栈的,弄起来不是特别顺手- -)
下面是主要代码,python3.0通过:

<[[CDATA--
from chunk import Chunk
from struct import unpack_from
from zlib import decompress
from beam.beam_ops import *
from io import BytesIO

# Open the BEAM file, print the outer container header, then collect the raw
# payload of every sub-chunk into `chunks`, keyed by the chunk name (bytes).
# The `with` block guarantees the file is closed even if parsing fails
# part-way, and the handle no longer shadows the builtin `input`.
with open('a.beam', 'rb') as beam_file:
    beam = Chunk(beam_file)
    print("Name=%s Size=%x"%(beam.getname(),beam.getsize()))
    beam_head = beam.read(4)
    print("BeamHeader=%s"%beam_head)
    chunks = {}
    while True:
        try:
            ch = Chunk(beam, align=False)
        except EOFError:
            # No further sub-chunk header could be read: end of container.
            break
        print("\tName=%s Size=%x"%(ch.getname(),ch.getsize()))
        # Sub-chunk payloads are 4-byte aligned: compute where the next
        # header starts (payload size rounded up to a multiple of 4) BEFORE
        # reading the payload, because reading moves the file position.
        skip = beam_file.tell() + (ch.getsize() + 3) // 4 * 4
        chunks[ch.getname()] = ch.read()
        # Step over any padding bytes that follow the payload.
        beam_file.seek(skip)

# Atom table. Index 0 holds a placeholder entry so that the atoms read from
# the file start at index 1 (the printout below numbers them from 1 too).
atoms = [b'a']
# .get() so that a file without this chunk is skipped instead of raising
# KeyError, which is what the truthiness guard was meant to achieve.
data = chunks.get(b"Atom")
if data:
    print("Atom Table:")
    count = unpack_from('>L', data, 0)[0]
    ptr = 4
    print("\tCount=%d"%count)
    for atom_no in range(1, count + 1):
        # Each entry is a single length byte followed by that many name bytes.
        name_len = data[ptr]
        ptr += 1
        name = data[ptr:ptr + name_len]
        ptr += name_len
        atoms.append(name)
        print("\tAtom[%d]=%s"%(atom_no,name))

# Import table: a 4-byte big-endian count followed by 12-byte records of
# three unsigned 32-bit integers (module atom index, function atom index,
# arity). The raw tuples are kept in `imports`.
imports=[]
# .get() so that a missing chunk is skipped instead of raising KeyError.
data = chunks.get(b"ImpT")
if data:
    print("Import Table:")
    count = unpack_from('>L', data, 0)[0]
    print("\tCount=%d"%count)
    for i in range(count):
        # Unpack in place at the record's offset rather than copying a slice.
        x = unpack_from('>LLL', data, i * 12 + 4)
        module_idx, name_idx, arity = x
        imports.append(x)
        print("\tImport[%d]=%s:%s/%d"%(i,atoms[module_idx],atoms[name_idx],arity))

# Export table: a 4-byte big-endian count followed by 12-byte records of
# three unsigned 32-bit integers (function atom index, arity, label).
# The raw tuples are kept in `exports`.
exports=[]
# .get() so that a missing chunk is skipped instead of raising KeyError.
data = chunks.get(b"ExpT")
if data:
    print("Export Table:")
    count = unpack_from('>L', data, 0)[0]
    print("\tCount=%d"%count)
    for i in range(count):
        # Unpack in place at the record's offset rather than copying a slice.
        x = unpack_from('>LLL', data, i * 12 + 4)
        # `lable_idx` (sic) keeps its original spelling: it is a module-level
        # name, so renaming it could break other parts of the script.
        name_idx, arity, lable_idx = x
        exports.append(x)
        print("\tExport[%d]=%s/%d@%d"%(i,atoms[name_idx],arity,lable_idx))

# Literal table: a 4-byte declared size followed by a zlib-compressed body.
# The decompressed body is a 4-byte count and then, per literal, a 4-byte
# length followed by that many bytes of encoded term.
literals=[]
# .get() so that a file without this chunk is skipped instead of raising
# KeyError, which is what the truthiness guard was meant to achieve.
zdata = chunks.get(b"LitT")
if zdata:
    zlen = len(zdata)
    size = unpack_from('>L', zdata, 0)[0]
    data = decompress(zdata[4:])
    ulen = len(data)
    print("Literal Table: {Compressed=%d Uncompressed=%d Decleared=%d}"%(zlen,ulen,size))
    count = unpack_from('>L', data, 0)[0]
    ptr = 4
    print("\tCount=%d"%count)
    for i in range(count):
        lit_len = unpack_from('>L', data, ptr)[0]
        ptr += 4
        # First byte is read as an int (version), second as a 1-byte tag.
        VERSION_MAGIC, EXT_TAG = unpack_from('>Bc', data, ptr)
        if EXT_TAG == b'k':
            # For tag b'k' the payload starts 4 bytes in: version, tag, and
            # presumably a 2-byte length field -- confirm against the
            # external term format.
            value = data[ptr + 4:ptr + lit_len]
        else:
            # Other tags: keep everything after the version and tag bytes.
            value = data[ptr + 2:ptr + lit_len]
        ptr += lit_len
        literals.append(value)
        print("\tLiteral[%d]={v:%d,t:%s} %s"%(i,VERSION_MAGIC,op_external_tags[EXT_TAG],value))

# Lambda (fun) table: a 4-byte big-endian count followed by 24-byte records
# of six unsigned 32-bit integers. The raw tuples are kept in `lambdas`.
lambdas=[]
# .get() so that a file without this chunk is skipped instead of raising
# KeyError, which is what the truthiness guard was meant to achieve.
data = chunks.get(b"FunT")
if data:
    print("Lambda Table:")
    count = unpack_from('>L', data, 0)[0]
    print("\tCount=%d"%count)
    for i in range(count):
        off = i * 24 + 4
        x = unpack_from('>LLLLLL', data, off)
        function_index, arity, lable_index, index, num_free, old_uniq = x
        # Fix: records belong in `lambdas`; the original appended them to
        # `exports`, leaving `lambdas` permanently empty.
        lambdas.append(x)
        # Fix: print this record's own label (`lable_index`); the original
        # printed `lable_idx`, a stale value left over from the export loop.
        print("\tLambda[%d]=%s/%d@%d %d %d %d"%(i,atoms[function_index],arity,lable_index,index,num_free,old_uniq))

# Code section: five unsigned 32-bit header fields (20 bytes) followed by the
# instruction stream. Each instruction is one opcode byte followed by
# `op_arities[op]` operands decoded by `read_arg`. Decoded instructions are
# kept in `codes` as (opcode, args) tuples.
codes=[]
# .get() so that a missing chunk is skipped instead of raising KeyError.
data = chunks.get(b"Code")
if data:
    print("Code Section:")
    code_len, code_ver, code_max, num_lables, num_functions = unpack_from('>LLLLL', data, 0)
    print("\tCodeLen=%d"%code_len)
    print("\tCodeVer=%d"%code_ver)
    print("\tCodeMax=%x"%code_max)
    print("\tLables=%d"%num_lables)
    print("\tFunctions=%d"%num_functions)
    # Drop the 20-byte header; what remains is the instruction stream.
    data = data[20:]
    dlen = len(data)
    # Named `code_stream` so the builtin `input` is not shadowed.
    code_stream = BytesIO(data)
    # The loop header was garbled in the original paste; it is restored as
    # "decode until the stream position reaches the end of the data".
    while code_stream.tell() < dlen:
        off = code_stream.tell()
        op = code_stream.read(1)[0]
        arity = op_arities[op]
        args = [
            read_arg(code_stream, atoms=atoms, literals=literals)
            for _ in range(arity)
        ]
        codes.append((op, args))
        print("\tCodeOffset[%04X]=%s %s"%(off,op_names[op],args))
--]]>

然后顺便还从erts里面提取了一些字节码定义数据,又臭又长,不贴了,放地址:
py.rar

2 条评论: