1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 | Lib/lib2to3/pgen2/conv.py
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. # Licensed to PSF under a Contributor Agreement. """Convert graminit.[ch] spit out by pgen to Python code. Pgen is the Python parser generator. It is useful to quickly create a parser from a grammar file in Python's grammar notation. But I don't want my parsers to be written in C (yet), so I'm translating the parsing tables to Python data structures and writing a Python parse engine. Note that the token numbers are constants determined by the standard Python tokenizer. The standard token module defines these numbers and their names (the names are not used much). The token numbers are hardcoded into the Python tokenizer and into pgen. A Python implementation of the Python tokenizer is also available, in the standard tokenize module. On the other hand, symbol numbers (representing the grammar's non-terminals) are assigned by pgen based on the actual grammar input. Note: this module is pretty much obsolete; the pgen module generates equivalent grammar tables directly from the Grammar.txt input file without having to invoke the Python pgen C program. """ # Python imports import re # Local imports from pgen2 import grammar, token class Converter(grammar.Grammar): """Grammar subclass that reads classic pgen output files. The run() method reads the tables as produced by the pgen parser generator, typically contained in two C files, graminit.h and graminit.c. The other methods are for internal use only. See the base class for more documentation. """ def run(self, graminit_h, graminit_c): """Load the grammar tables from the text files written by pgen.""" self.parse_graminit_h(graminit_h) self.parse_graminit_c(graminit_c) self.finish_off() def parse_graminit_h(self, filename): """Parse the .h file written by pgen. (Internal) This file is a sequence of #define statements defining the nonterminals of the grammar as numbers. We build two tables mapping the numbers to names and back. """ try: f = open(filename) except IOError, err: print "Can't open %s: %s" % (filename, err) return False self.symbol2number = {} self.number2symbol = {} lineno = 0 for line in f: lineno += 1 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) if not mo and line.strip(): print "%s(%s): can't parse %s" % (filename, lineno, line.strip()) else: symbol, number = mo.groups() number = int(number) assert symbol not in self.symbol2number assert number not in self.number2symbol self.symbol2number[symbol] = number self.number2symbol[number] = symbol return True def parse_graminit_c(self, filename): """Parse the .c file written by pgen. (Internal) The file looks as follows. The first two lines are always this: #include "pgenheaders.h" #include "grammar.h" After that come four blocks: 1) one or more state definitions 2) a table defining dfas 3) a table defining labels 4) a struct defining the grammar A state definition has the following form: - one or more arc arrays, each of the form: static arc arcs_<n>_<m>[<k>] = { {<i>, <j>}, ... }; - followed by a state array, of the form: static state states_<s>[<t>] = { {<k>, arcs_<n>_<m>}, ... }; """ try: f = open(filename) except IOError, err: print "Can't open %s: %s" % (filename, err) return False # The code below essentially uses f's iterator-ness! lineno = 0 # Expect the two #include lines lineno, line = lineno+1, f.next() assert line == '#include "pgenheaders.h"\n', (lineno, line) lineno, line = lineno+1, f.next() assert line == '#include "grammar.h"\n', (lineno, line) # Parse the state definitions lineno, line = lineno+1, f.next() allarcs = {} states = [] while line.startswith("static arc "): while line.startswith("static arc "): mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", line) assert mo, (lineno, line) n, m, k = map(int, mo.groups()) arcs = [] for _ in range(k): lineno, line = lineno+1, f.next() mo = re.match(r"\s+{(\d+), (\d+)},$", line) assert mo, (lineno, line) i, j = map(int, mo.groups()) arcs.append((i, j)) lineno, line = lineno+1, f.next() assert line == "};\n", (lineno, line) allarcs[(n, m)] = arcs lineno, line = lineno+1, f.next() mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) assert mo, (lineno, line) s, t = map(int, mo.groups()) assert s == len(states), (lineno, line) state = [] for _ in range(t): lineno, line = lineno+1, f.next() mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) assert mo, (lineno, line) k, n, m = map(int, mo.groups()) arcs = allarcs[n, m] assert k == len(arcs), (lineno, line) state.append(arcs) states.append(state) lineno, line = lineno+1, f.next() assert line == "};\n", (lineno, line) lineno, line = lineno+1, f.next() self.states = states # Parse the dfas dfas = {} mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) assert mo, (lineno, line) ndfas = int(mo.group(1)) for i in range(ndfas): lineno, line = lineno+1, f.next() mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', line) assert mo, (lineno, line) symbol = mo.group(2) number, x, y, z = map(int, mo.group(1, 3, 4, 5)) assert self.symbol2number[symbol] == number, (lineno, line) assert self.number2symbol[number] == symbol, (lineno, line) assert x == 0, (lineno, line) state = states[z] assert y == len(state), (lineno, line) lineno, line = lineno+1, f.next() mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) assert mo, (lineno, line) first = {} rawbitset = eval(mo.group(1)) for i, c in enumerate(rawbitset): byte = ord(c) for j in range(8): if byte & (1<<j): first[i*8 + j] = 1 dfas[number] = (state, first) lineno, line = lineno+1, f.next() assert line == "};\n", (lineno, line) self.dfas = dfas # Parse the labels labels = [] lineno, line = lineno+1, f.next() mo = re.match(r"static label labels\[(\d+)\] = {$", line) assert mo, (lineno, line) nlabels = int(mo.group(1)) for i in range(nlabels): lineno, line = lineno+1, f.next() mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) assert mo, (lineno, line) x, y = mo.groups() x = int(x) if y == "0": y = None else: y = eval(y) labels.append((x, y)) lineno, line = lineno+1, f.next() assert line == "};\n", (lineno, line) self.labels = labels # Parse the grammar struct lineno, line = lineno+1, f.next() assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) lineno, line = lineno+1, f.next() mo = re.match(r"\s+(\d+),$", line) assert mo, (lineno, line) ndfas = int(mo.group(1)) assert ndfas == len(self.dfas) lineno, line = lineno+1, f.next() assert line == "\tdfas,\n", (lineno, line) lineno, line = lineno+1, f.next() mo = re.match(r"\s+{(\d+), labels},$", line) assert mo, (lineno, line) nlabels = int(mo.group(1)) assert nlabels == len(self.labels), (lineno, line) lineno, line = lineno+1, f.next() mo = re.match(r"\s+(\d+)$", line) assert mo, (lineno, line) start = int(mo.group(1)) assert start in self.number2symbol, (lineno, line) self.start = start lineno, line = lineno+1, f.next() assert line == "};\n", (lineno, line) try: lineno, line = lineno+1, f.next() except StopIteration: pass else: assert 0, (lineno, line) def finish_off(self): """Create additional useful structures. (Internal).""" self.keywords = {} # map from keyword strings to arc labels self.tokens = {} # map from numeric token values to arc labels for ilabel, (type, value) in enumerate(self.labels): if type == token.NAME and value is not None: self.keywords[value] = ilabel elif value is None: self.tokens[type] = ilabel |