lua解析脚本过程中的关键数据结构介绍

在这一篇文章中我先来介绍一下lua解析一个脚本文件时要用到的一些关键的数据结构，为将来的一系列代码分析打下一个良好的基础。在整个过程中，比较重要的几个源码文件分别是：llex.h，lparse.h、lobject.h和lopcode.h。

在llex.h中

 typedef struct Token {

   int token;

   SemInfo seminfo;

 } Token;

Token代表了一个词法单元，其中token表示词法类型如TK_NAME、TK_NUMBER等如果不是这些类型则存放则词素的字符表示，例如分析的代码会这么判断词素单元：

 switch (ls->t.token) {

     case '(': {

      //...

     }

     case TK_NAME: {

       //...

     }

     default: {

       //...

     }

在Token中SemInfo存放了一些语义相关的一些内容信息

 typedef union {

   lua_Number r;

   TString *ts;

 } SemInfo;  /* semantics information */

其中当token是数字是内容存放在r中，其他情况存放在ts指向的TString中。

下面是最重要的一个数据结构之一

 typedef struct LexState {

   int current;  /* current character (charint) */

   int linenumber;  /* input line counter */

   int lastline;  /* line of last token `consumed' */

   Token t;  /* current token */

   Token lookahead;  /* look ahead token */

   struct FuncState *fs;  /* `FuncState' is private to the parser */

   struct lua_State *L;

   ZIO *z;  /* input stream */

   Mbuffer *buff;  /* buffer for tokens */

   TString *source;  /* current source name */

   char decpoint;  /* locale decimal point */

 } LexState;

LexState不仅用于保存当前的词法分析状态信息，而且也保存了整个编译系统的全局状态。current指向了当前字符，t存放了当前的toekn，lookahead存放了向前看的token，由此我认为lua应该是ll(1)的~哈哈（不知道对不对）。fs指向了parser当前解析的函数的一些相关的信息，L指向了当前lua_State结构，z指向输入流，buff指向了token buffer，其他的看注释吧。

下面看看lparse.h文件中的重要结构：

 typedef struct expdesc {

   expkind k;

   union {

     struct { int info, aux; } s;

     lua_Number nval;

   } u;

   int t;  /* patch list of `exit when true' */

   int f;  /* patch list of `exit when false' */

 } expdesc;

expdesc是存放了表达式的相关描述信息，k是表达式的种类，u在不同的表达式中有不同的含义。

 typedef struct upvaldesc {

   lu_byte k;

   lu_byte info;

 } upvaldesc;

upvaldesc是存放了upval的相关描述信息。

最后是本文件中最重要的结构：

 typedef struct FuncState {

   Proto *f;  /* current function header */

   Table *h;  /* table to find (and reuse) elements in `k' */

   struct FuncState *prev;  /* enclosing function */

   struct LexState *ls;  /* lexical state */

   struct lua_State *L;  /* copy of the Lua state */

   struct BlockCnt *bl;  /* chain of current blocks */

   int pc;  /* next position to code (equivalent to `ncode') */

   int lasttarget;   /* `pc' of last `jump target' */

   int jpc;  /* list of pending jumps to `pc' */

   int freereg;  /* first free register */

   int nk;  /* number of elements in `k' */

   int np;  /* number of elements in `p' */

   short nlocvars;  /* number of elements in `locvars' */

   lu_byte nactvar;  /* number of active local variables */

   upvaldesc upvalues[LUAI_MAXUPVALUES];  /* upvalues */

   unsigned short actvar[LUAI_MAXVARS];  /* declared-variable stack */

 } FuncState;

在编译过程中，使用FuncState结构体来保存一个函数编译的状态数据。其中，f指向了本函数的协议描述结构体，prev指向了其父函数的FuncState描述，因为在lua中可以在一个函数中定义另一个函数，因此当parse到一个函数的内部函数的定义时会new一个FuncState来描述内部函数，同时开始parse这个内部函数，将这个FuncState的prev指向其外部函数的FuncState，prev变量用来引用外围函数的FuncState，使当前所有没有分析完成的FuncState形成一个栈结构。bl指向当前parse的block，在一个函数中会有很多block代码，lua会将这些同属于同一个函数的block用链表串联起来。jpc是一个OP_JMP指令的链表，因为lua是一遍过的parse，在开始的时候有一些跳转指令不能决定其跳转位置，因此jpc将这些pending jmp指令串联起来，在以后能确定的时候回填，freereg为第一个空闲寄存器的下标，upvalues数组保存了当前函数的所有upvalue，nactvar是当前作用域的局部变量数。

在lparse.c中定义了BlockCnt

 /*

 ** nodes for block list (list of active blocks)

 */

 typedef struct BlockCnt {

   struct BlockCnt *previous;  /* chain */

   int breaklist;  /* list of jumps out of this loop */

   lu_byte nactvar;  /* # active locals outside the breakable structure */

   lu_byte upval;  /* true if some variable in the block is an upvalue */

   lu_byte isbreakable;  /* true if `block' is a loop */

 } BlockCnt;

Lua使用BlockCnt来保存一个block的数据。与FuncState的分析方法类似，BlockCnt使用一个previous变量保存外围block的引用，形成一个栈结构。

lua解析脚本过程中的关键数据结构介绍

下面介绍一些在lobject.h文件里面的数据结构

 /*

 ** Function Prototypes

 */

 typedef struct Proto {

   CommonHeader;

   TValue *k;  /* constants used by the function */

   Instruction *code;

   struct Proto **p;  /* functions defined inside the function */

   int *lineinfo;  /* map from opcodes to source lines */

   struct LocVar *locvars;  /* information about local variables */

   TString **upvalues;  /* upvalue names */

   TString  *source;

   int sizeupvalues;

   int sizek;  /* size of `k' */

   int sizecode;

   int sizelineinfo;

   int sizep;  /* size of `p' */

   int sizelocvars;

   int linedefined;

   int lastlinedefined;

   GCObject *gclist;

   lu_byte nups;  /* number of upvalues */

   lu_byte numparams;

   lu_byte is_vararg;

   lu_byte maxstacksize;

 } Proto;

结构体Proto是lua函数协议的描述，在lua解析脚本时首先会将main chunk代码包裹为一个函数，用main proto描述，接着将里面定义的内部函数一一用Proto结构体描述，将这些Proto的关系用树来组合起来，例如有lua源码文件如下

 a =

 function f1()

 -- ...

 end

 function f2()

     function f3()

     -- ...

     end

 end

则parse完成后会有如图如下关系

lua解析脚本过程中的关键数据结构介绍

在Proto结构体中，k指向一个const变量数组，存放则函数要用到的常量；code指向lua parse过程中生成的本函数的instruction集合；p就是指向本函数内部定义的函数的那些proto；locvars指向本函数局部变量数组；upvalues指向本函数upvalue变量数组；nups为upvalue的数量；numparams为函数参数的数量；is_vararg表示函数是否接收可变参数；maxstacksize为函数stack的max大小。

在编译期间lua使用Proto描述函数的，当lua vm开始运行vm时需要根据Proto生成相应的Closure来执行vm instructions。

 typedef union Closure {

   CClosure c;

   LClosure l;

 } Closure;

Closure要么代表了c函数，要么为lua函数，在这里我们只看lua函数的LClosure

 #define ClosureHeader \

     CommonHeader; lu_byte isC; lu_byte nupvalues; GCObject *gclist; \

     struct Table *env

 //... ...

 typedef struct LClosure {

   ClosureHeader;

   struct Proto *p;

   UpVal *upvals[];

 } LClosure;

在LClousre中，p就是指向对应函数的Proto结构体啦，upvals顾名思义就是此closure的upvalue数组罗。在ClosureHeader宏中isC表示此closure是否是c函数，nupvalues为upvalue数目，env指向了此closue运行时的函数环境，在lua中可以用stefenv来改变当前函数的环境，就是改变env变量的指向啦。

最后，在文件lopcode.h中定义了lua vm的指令结构

lua解析脚本过程中的关键数据结构介绍

下面是vm指令的一些定义与描述，我在相应vm指令的上方添加了一些注释

 typedef enum {

 /*----------------------------------------------------------------------

 name        args    description

 ------------------------------------------------------------------------*/

 OP_MOVE,/*    A B    R(A) := R(B)                    */

 //Constants are usually numbers or strings. Each function has its own constant list, or pool.

 OP_LOADK,/*    A Bx    R(A) := Kst(Bx)                    */

 OP_LOADBOOL,/*    A B C    R(A) := (Bool)B; if (C) pc++            */

 //The optimization rule is  a simple one: If no other instructions have been generated,

 //then a LOADNIL as the first instruction can be optimized away.

 OP_LOADNIL,/*    A B    R(A) := ... := R(B) := nil            */

 OP_GETUPVAL,/*    A B    R(A) := UpValue[B]                */

 OP_GETGLOBAL,/*    A Bx    R(A) := Gbl[Kst(Bx)]                */

 OP_GETTABLE,/*    A B C    R(A) := R(B)[RK(C)]                */

 OP_SETGLOBAL,/*    A Bx    Gbl[Kst(Bx)] := R(A)                */

 OP_SETUPVAL,/*    A B    UpValue[B] := R(A)                */

 OP_SETTABLE,/*    A B C    R(A)[RK(B)] := RK(C)                */

 OP_NEWTABLE,/*    A B C    R(A) := {} (size = B,C)                */

 //This instruction is used for object-oriented programming. It is only generated for method calls that use the colon syntax.

 //R(B) is the register holding the reference to the table with the method.

 OP_SELF,/*    A B C    R(A+1) := R(B); R(A) := R(B)[RK(C)]        */

 //The optimization rule is simple: If both terms of a subexpression are numbers,

 //the subexpression will be evaluated at compile time.

 OP_ADD,/*    A B C    R(A) := RK(B) + RK(C)                */

 OP_SUB,/*    A B C    R(A) := RK(B) - RK(C)                */

 OP_MUL,/*    A B C    R(A) := RK(B) * RK(C)                */

 OP_DIV,/*    A B C    R(A) := RK(B) / RK(C)                */

 OP_MOD,/*    A B C    R(A) := RK(B) % RK(C)                */

 OP_POW,/*    A B C    R(A) := RK(B) ^ RK(C)                */

 OP_UNM,/*    A B    R(A) := -R(B)                    */

 OP_NOT,/*    A B    R(A) := not R(B)                */

 //Returns the length of the object in R(B)

 OP_LEN,/*    A B    R(A) := length of R(B)                */

 //Performs concatenation of two or more strings.

 //The source registers must be consecutive, and C must always be greater than B.

 OP_CONCAT,/*    A B C    R(A) := R(B).. ... ..R(C)            */

 //if sBx is 0, the VM will proceed to the next instruction

 OP_JMP,/*    sBx    pc+=sBx                    */

 /*If the boolean result is not A, then skip the next instruction.

 Conversely, if the boolean result equals A, continue with the next instruction.*/

 OP_EQ,/*    A B C    if ((RK(B) == RK(C)) ~= A) then pc++        */

 OP_LT,/*    A B C    if ((RK(B) <  RK(C)) ~= A) then pc++          */

 OP_LE,/*    A B C    if ((RK(B) <= RK(C)) ~= A) then pc++          */

 OP_TEST,/*    A C    if not (R(A) <=> C) then pc++            */

 //register R(B) is coerced into a boolean.

 OP_TESTSET,/*    A B C    if (R(B) <=> C) then R(A) := R(B) else pc++    */ 

 //If B is 0, parameters range from R(A+1) to the top of the stack.If B is 1, the function has no parameters.

 //If C is 1, no return results are saved. If C is 0, then multiple return results are saved, depending on the called function

 //CALL always updates the top of stack value.

 OP_CALL,/*    A B C    R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */

 OP_TAILCALL,/*    A B C    return R(A)(R(A+1), ... ,R(A+B-1))        */

 //If B is 1, there are no return values. If B is 0, the set of values from R(A) to the top of the stack is returned.

 OP_RETURN,/*    A B    return R(A), ... ,R(A+B-2)    (see note)    */

 //FORPREP initializes a numeric for loop, while FORLOOP performs an iteration of a numeric for loop.

 OP_FORLOOP,/*    A sBx    R(A)+=R(A+2);

             if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/

 OP_FORPREP,/*    A sBx    R(A)-=R(A+2); pc+=sBx                */

 //Performs an iteration of a generic for loop.

 OP_TFORLOOP,/*    A C    R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2));

                         if R(A+3) ~= nil then R(A+2)=R(A+3) else pc++    */

 //This instruction is used to initialize array elements in a table.

 //If B is 0, the table is set with a variable number of array elements, from register R(A+1) up to the top of the stack.

 //If C is 0, the next instruction is cast as an integer, and used as the C value.

 OP_SETLIST,/*    A B C    R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B    */

 /*If a local is used as an upvalue, then the local variable need to be placed somewhere,

 other wise it will go out of scope and disappear when a lexicalblock enclosing the local variable ends.

 CLOSE performs this operation for all affected local variables for do end blocks or loop blocks.

 RETURN also does an implicit CLOSE when a function returns.*/

 OP_CLOSE,/*    A     close all variables in the stack up to (>=) R(A)*/

 /*Each upvalue corresponds to either a MOVE or a GETUPVAL pseudo-instruction.

 Only the B field on either of these pseudo-instructions are significant.*/

 //MOVE pseudo-instructions corresponds to local variable R(B) in the current lexical block.

 //GETUPVAL pseudo-instructions corresponds upvalue number B in the current lexical block.

 OP_CLOSURE,/*    A Bx    R(A) := closure(KPROTO[Bx], R(A), ... ,R(A+n))    */

 //If B is 0, VARARG copies as many values as it can based on the number of parameters passed.

 //If a fixed number of values is required, B is a value greater than 1.

 OP_VARARG/*    A B    R(A), R(A+1), ..., R(A+B-1) = vararg        */

 } OpCode;

秒客网

lua解析脚本过程中的关键数据结构介绍

相关文章