Dex文件解析

jiezi

5 年前

/*
 * File header of a dex file. It is a fixed-length structure, so it can be
 * read directly. The *Size / *Off pairs below describe each table.
 */
typedef struct {u1  magic[MAGIC_LENGTH];           /* includes version number */
    u4  checksum;           /* adler32 checksum over the rest of the file */
    u1  signature[kSHA1DigestLen]; /* SHA-1 file signature */
    u4  fileSize;           /* length of entire file */
    u4  headerSize;         /* offset to start of next section */
    u4  endianTag;
    u4  linkSize;
    u4  linkOff;
    u4  mapOff;
    u4  stringIdsSize;   // string table: size, offset
    u4  stringIdsOff;
    u4  typeIdsSize;    // type table: size, offset
    u4  typeIdsOff;
    u4  protoIdsSize;   // prototype table: size, offset
    u4  protoIdsOff;
    u4  fieldIdsSize;   // field table: size, offset
    u4  fieldIdsOff;
    u4  methodIdsSize;   // method table: size, offset
    u4  methodIdsOff;
    u4  classDefsSize;  // class definition table: size, offset
    u4  classDefsOff;
    u4  dataSize;   // data section: size, offset
    u4  dataOff;
}DexHeader;

DexHeader 是定长结构，直接按结构体读取（格式化）即可

每个 LEB128 由 1 到 5 个字节组成，所有字节组合到一起代表一个 32 位值。除了最后一个字节的最高位（标志位）为 0，其它字节的最高位都为 1；每个字节剩下的 7 位为有效负荷，第一个字节存放最低的 7 位，后续字节的 7 位依次接在更高位上。有符号 LEB128 的符号由最后一个字节有效负荷的最高位决定。

例如：文件中依次出现的两个字节 0x80 0x7f（按小端读成 u2 即 0x7f80）

01111111 10000000

按无符号 leb128 解析 0x3f80
按有符号 leb128 解析 -128（注意先转补码）

具体解析算法在示例代码中

字符串表包含了 dex 文件 / 代码中使用到的字符串

字符串表存放的是 StringId，具体字符串值在数据段 data 中

/* Entry of the string table; the actual string value lives in the data section. */
typedef struct {u4 stringDataOff;      /* offset of the string_data_item */}DexStringId;

/*
 * String payload that DexStringId.stringDataOff points at (pseudo-declaration).
 * NOTE(review): a uleb128 value occupies 1 to 5 bytes, so the u2 type below is
 * only illustrative and must not be read as a fixed 2-byte field.
 */
struct string_data_item {
    u2 uleb128; // string length, uleb128-encoded
    u1 str[1];  // string contents
}

string_data_item 以一个 uleb128 编码的值开头（长度不固定，占 1 到 5 个字节），解码后可得到字符串的长度，其后紧跟字符串内容

/* Entry of the type table. */
typedef struct {u4  descriptorIdx;      /* index of a string_id */}DexTypeId;

/* Describes a member variable or static variable of a class. */
typedef struct {
    u2  classIdx;           /* index into typeIds list for defining class */
    u2  typeIdx;            /* index into typeIds for field type */
    u4  nameIdx;            /* index into stringIds for field name */
}DexFieldId;

Field 描述的是一个类中的成员变量 / 静态变量

/*
 * Describes a function prototype: its return type and its parameter type
 * list. A parametersOff of 0 means the function takes no parameters.
 */
typedef struct {
    u4  shortyIdx;          /* index into stringIds for shorty descriptor */
    u4  returnTypeIdx;      /* index into typeIds list for return type */
    u4  parametersOff;      /* file offset to type_list for parameter types */
}DexProtoId;

Proto 原型描述的是一个函数的返回类型和参数类型列表

由于参数可能有多个，parametersOff 指向的是一个 type_list 结构

/* One element of a DexTypeList. */
typedef struct {u2  typeIdx;            /* index into typeIds */}DexTypeItem;


/*
 * type_list structure: a count followed by that many DexTypeItem entries.
 * DexProtoId.parametersOff points at one of these for the parameter types.
 */
typedef struct {
    u4  size;               /* #of entries in list */
    DexTypeItem list[1];    /* entries */
}DexTypeList;

如果 parametersOff 为 0 表示该函数没有参数

/* Describes a function: the class it belongs to, its prototype and its name. */
typedef struct {
    u2  classIdx;           /* index into typeIds list for defining class */
    u2  protoIdx;           /* index into protoIds for method prototype */
    u4  nameIdx;            /* index into stringIds for method name */
}DexMethodId;

Method 描述的是函数所在的类、函数的原型以及函数的名称

/*
 * Class definition entry.
 * interfacesOff, annotationsOff, classDataOff and staticValuesOff may each be
 * 0 when the class has no data of that kind (e.g. a marker class that defines
 * no functions or fields has classDataOff == 0).
 * sourceFileIdx may hold an invalid index.
 */
typedef struct{
    u4  classIdx;           /* index into typeIds for this class */
    u4  accessFlags;
    u4  superclassIdx;      /* index into typeIds for superclass */
    u4  interfacesOff;      /* file offset to DexTypeList */
    u4  sourceFileIdx;      /* index into stringIds for source file name */
    u4  annotationsOff;     /* file offset to annotations_directory_item */
    u4  classDataOff;       /* file offset to class_data_item */
    u4  staticValuesOff;    /* file offset to DexEncodedArray */
}DexClassDef;

superclassIdx 是类型表中的索引；当它等于 kDexNoIndex 时表示该类没有父类（例如 java/lang/Object 本身）

interfacesOff/annotationsOff/classDataOff/staticValuesOff 都有可能是 0，表示类中没有该类型的数据，例如一个标记类的 classDataOff 可能就会为 0，因为它没有定义任何函数 / 字段

sourceFileIdx 可能会是一个无效的 id

#define kDexNoIndex 0xffffffff          /* not a valid index value */

classDataOff 表示类数据的偏移，指向的是 class_data 结构

/*
 * Layout of the class data that DexClassDef.classDataOff points at
 * (pseudo-declaration). The four counts are uleb128-encoded and give the
 * lengths of the four arrays that follow them.
 */
struct class_data{
    u4_uleb128 staticFieldsSize;
    u4_uleb128 instanceFieldsSize;
    u4_uleb128 directMethodsSize;
    u4_uleb128 virtualMethodsSize;
    
    DexField staticFields[staticFieldsSize];
    DexField instanceFields[instanceFieldsSize];
    DexMethod directMethods[directMethodsSize];
    DexMethod virtualMethods[virtualMethodsSize];
}

// Encoded field: one element of the staticFields / instanceFields arrays of class_data.
typedef struct {
    // In the file these values are uleb128-encoded; u4 is the decoded form.
    // NOTE(review): the dex format spec may store this index as a delta from
    // the previous entry (field_idx_diff) - confirm before decoding.
    u4 fieldIdx;    /* index into the field table */
    u4 accessFlags;
}DexField;


// Encoded method: one element of the directMethods / virtualMethods arrays of class_data.
typedef struct{
    // In the file these values are uleb128-encoded; u4 is the decoded form.
    // NOTE(review): the dex format spec may store this index as a delta from
    // the previous entry (method_idx_diff) - confirm before decoding.
    u4 methodIdx;    /* index into the method table */
    u4 accessFlags;
    u4 codeOff;      /* offset of the DexCode */
}DexMethod;

/* Code block of a method, located at DexMethod.codeOff. */
typedef struct {
    u2  registersSize;  // number of registers used inside the code block
    u2  insSize;  // number of words of incoming arguments
    u2  outsSize; // number of words of outgoing arguments
    u2  triesSize;  // number of try/catch entries
    u4  debugInfoOff;       /* file offset to debug info stream */
    u4  insnsSize;          /* number of bytecode units in insns */
    u2  insns[1];   // bytecode contents
    // Everything below is present only when triesSize > 0.
    // The padding keeps the try-handler table 4-byte aligned after the bytecode.
    /* followed by optional u2 padding */ 
    // try/catch handler table; it plays the role of the try-handler table of a class file
    /* followed by try_item[triesSize] */
    /* followed by uleb128 handlersSize */
    /* followed by catch_handler_item[handlersSize] */
}DexCode;

dex 字节码的翻译和 class 字节码翻译差不多，对着规范翻译就好

android vm 采用 dex 字节码而不是 class 字节码的优势？

dex 文件由多个 class 文件合并而来，把多个常量池合并到一个常量池，避免了常量冗余，有利于运行时的常量内存共享
加载一个 dex 可以加载多个相互依赖的 class，减少了文件 io
arm cpu 具有较多的通用寄存器，vm 设计基于寄存器的执行流程，会加速函数的传参和执行

DexParserDemo

Bytecode for the Dalvik VM

Dex文件解析

Dex 文件结构

文件头

Leb128 编码

字符串表

类型表

字段表

原型表

函数表

类数据

综述

本文代码

参考文档