#!/usr/bin/python

from re import compile as re
from json import dumps as json

#-----------------------------------------------------------------------------

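# Matcher helpers: each returns the number of characters the pattern consumes
# at the start of `text`, or None when it does not match there.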
def check_regexp(pattern, text):
    match = pattern.match(text)
    if match:
        return match.end()
    else:
        return None

def check_string(pattern, text):
    if text.startswith(pattern):
        return len(pattern)
    else:
        return None

# count leading spaces in `text` (0 when there are none)
_leading_spaces = re(r' +')
def leading_spaces(text):
    match = _leading_spaces.match(text)
    if match: return match.end()
    else: return 0

class Lexer:
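    """Tokenizer for the wiki-like markup handled by this module.

    The lexer is iterable; each token is reported as a tuple of
    (token_name, token_value, position), where position is a dict with
    'byte', 'line' and 'char' keys.  Synthetic BEGIN_INDENT/END_INDENT
    tokens are emitted whenever the tracked indentation changes.
    """
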
    expected_indent = 2

    # possible flags:
    #   BOL    -- token only occurs at the beginning of a line
    #   EOL    -- token only occurs at the end of a line
    #   IS_EOL -- the next token after this one is considered to be at BOL
    #   SKIP   -- token should not be reported to the parser
    #   INDENT -- track indentation
    #   PRE    -- for the PRE token only: un-escapes a backslash at the
    #             beginning of each line and strips the first and last line
    #             (including the trailing \n) from the text
    #   RESET_INDENT -- emit END_INDENT markers for all open indentation
    #                   levels before reporting this token
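    # Each entry is (matcher, token_name[, flags]); the matcher is either a
    # literal string or a compiled regexp and is tried, in order, against the
    # start of the remaining input.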
    tokens = [
        # line ending
        (re(r' +\. *\n'), 'LINE_BREAK_INDENTED', ['BOL', 'IS_EOL']),
        (re(r'\. *\n'), 'LINE_BREAK', ['BOL', 'IS_EOL']),
        (re(r' *\n'), 'EMPTY_LINE', ['BOL', 'IS_EOL', 'RESET_INDENT']),
        (re(r' *\n'), 'NL', ['IS_EOL']),

        # preformatted text
        (re(r'\{\{\{ *\n(.*\n)*?\}\}\} *\n'), 'PRE', ['BOL', 'IS_EOL', 'PRE']),

        # change of section
        (re(r'____+ *\n'), 'NEW_SECTION', ['BOL', 'IS_EOL']),

        (re(r'\{\{\{ *\n'), '{{{', ['IS_EOL']),
        (re(r'\}\}\} *\n'), '}}}', ['BOL', 'IS_EOL', 'RESET_INDENT']),

        # in-line formatting
        (re(r'={1,5}'), 'HEADER_S', ['BOL']),
        (re(r'={1,5} *'), 'HEADER_E', ['EOL']),
        ('**', '**'),
        ('__', '__'),
        ('~~', '~~'),
        ('{{', '{{'),
        ('}}', '}}'),

        # [[@macros]] and [[%blocks]]
        (re(r'\[\[@@[a-zA-Z_][a-zA-Z0-9_]*(#[a-zA-Z_][a-zA-Z0-9_]*)?'), 'MACRO_CALL_SILENT'),
        (re(r'\[\[@[a-zA-Z_][a-zA-Z0-9_]*(#[a-zA-Z_][a-zA-Z0-9_]*)?'), 'MACRO_CALL'),
        (re(r'\$[a-zA-Z_][a-zA-Z0-9_]*'), 'MACRO_VALUE'),
        (re(r'\$\{[a-zA-Z_][a-zA-Z0-9_]*\}'), 'MACRO_VALUE'),
        (re(r'\[\[%[a-zA-Z_][a-zA-Z0-9_]*'), 'BLOCK_CALL'),
        (',', 'COMMA'),
        (re(r'"([^"\\\n]|\\.)*"'), 'STRING'),
        # see also: NUMBER

        # images and links
        ('[[!', '[[!'), # image
        ('[[', '[['),   # link
        (']]', ']]'),   # this actually is a generic terminator

        # stuff related to formatting whole blocks (lines, paragraphs)
        (re(r' *#.*\n'), 'COMMENT', ['BOL', 'IS_EOL', 'SKIP']),
        (re(r' +[0-9]+\. '), 'LIST_NUMERIC', ['BOL', 'INDENT']),
        (re(r' +[a-z]\. '), 'LIST_ALPHA', ['BOL', 'INDENT']),
        (re(r' +\* '), 'LIST_BULLET', ['BOL', 'INDENT']),
        (re(r' +\[\] '), 'LIST_UNDECO', ['BOL', 'INDENT']),
        (re(r' +'), 'INDENT', ['BOL', 'INDENT', 'SKIP']),
        ('||*', 'TABLE_ROW_H_START', ['BOL']),
        (re(r'\*\|\| *'), 'TABLE_ROW_H_END', ['EOL']),
        ('||', 'TABLE_ROW_START', ['BOL']),
        (re(r'\|\| *'), 'TABLE_ROW_END', ['EOL']),
        ('*||*', 'TABLE_CELL_LR'),
        ('*||', 'TABLE_CELL_L'),
        ('||*', 'TABLE_CELL_R'),
        ('||', 'TABLE_CELL'),

        # the rest of stuff
        (re(r'\\.'), 'Q_CHAR'),
        (re(r' +'), ' '),
        (re(r'[a-zA-Z]+'), 'WORD'),
        (re(r'-?([1-9][0-9]*|0)(\.[0-9]+)?([eE][+-]?[0-9]+)?'), 'NUMBER'),
        (re(r'.'), 'PUNCT'),
    ]

    def __init__(self, text = None):
        self.text = text.replace("\r", "")
        self.begin_of_line = True
        self.position = 0
        self.line = 1
        self.char = 1
        # tokens queued to be returned before more input is consumed
        self.backlog_stack = []
        # currently open indentation depth, in spaces
        self.indent = 0

    def __iter__(self):
        return self

    def next(self):
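        """Return the next token as a (name, value, position) tuple.

        Synthetic BEGIN_INDENT/END_INDENT markers are generated from the
        backlog stack when the indentation level changes or the input ends;
        StopIteration is raised once all the text has been consumed.
        """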
        # serve queued markers/tokens first
        if len(self.backlog_stack) > 0:
            return self.backlog_stack.pop()

        if self.text == '':
            # at the end of the input, close any indentation levels still open
            if self.indent > 0:
                nmarkers = self.indent / self.expected_indent
                marker = ('END_INDENT', '', {
                    'byte': self.position,
                    'line': self.line,
                    'char': self.char,
                })

                for i in xrange(nmarkers - 1):
                    self.backlog_stack.append(marker)

                self.indent = 0

                return marker

            raise StopIteration()

        result = None

        for token in self.tokens:
            # XXX: options processing
            if len(token) > 2:
                # skip the token if its BOL requirement is not satisfied
                if 'BOL' in token[2] and not self.begin_of_line:
                    continue

            # check if the token matches
            if isinstance(token[0], str):
                match_length = check_string(token[0], self.text)
            else:
                match_length = check_regexp(token[0], self.text)

            # XXX: could be 0 (False) only on matching an empty string
            if match_length:
                # XXX: options processing
                if len(token) > 2:
                    # skip a token with the EOL option if the match does not end
                    # at end of line (either at the end of the input or before a
                    # real '\n')
                    if 'EOL' in token[2] and not \
                       (len(self.text) == match_length or self.text[match_length] == '\n'):
                        continue

                # prepare the values required to build the result
                token_name = token[1]
                token_value = self.text[0:match_length]
                token_position = {
                    'byte': self.position,
                    'line': self.line,
                    'char': self.char,
                }
                if len(token) > 2:
                    token_flags = token[2]
                else:
                    token_flags = []

                # advance in the text
                self.text = self.text[match_length:]

                # update the token position
                self.position += match_length
                match_lines = token_value.split('\n')
                if len(match_lines) > 1:
                    self.line += len(match_lines) - 1
                    self.char = len(match_lines[-1]) + 1
                else:
                    self.char += len(token_value)

                result = (token_name, token_value, token_position, token_flags)
                break

        if result is None:
            # because of PUNCT this should never be reached
            return ('UNKNOWN_TOKEN', self.text[0:16])

        was_begin_of_line = self.begin_of_line

        # set the new-line marker for the next token
        if 'IS_EOL' in result[3]:
            self.begin_of_line = True
        else:
            self.begin_of_line = False

        if 'RESET_INDENT' in result[3]:
            if self.indent > 0:
                # close all open indentation levels: return one END_INDENT now
                # and queue the rest, plus the token itself, on the backlog
                nmarkers = self.indent / self.expected_indent
                marker = ('END_INDENT', '', result[2])

                self.backlog_stack.append(result[0:3])
                for i in xrange(nmarkers - 1):
                    self.backlog_stack.append(marker)

                self.indent = 0

                return marker

        if 'SKIP' in result[3]:
            # get the next token (skip this one)
            return self.next()
        elif 'INDENT' in result[3] and was_begin_of_line:
            # track indentation
            indent = leading_spaces(result[1])
            indent_diff = indent - self.indent
            self.indent = indent

            if indent_diff == 0:
                # indentation level kept
                return result[0:3]

            if abs(indent_diff) % self.expected_indent != 0:
                # TODO: change the exception class
                raise Exception(
                    'Invalid indentation at line %d, char %d' % (self.line, self.char)
                )

            if indent_diff > 0:
                marker = ('BEGIN_INDENT', '', result[2]) # indentation increased
            else:
                marker = ('END_INDENT', '', result[2])   # indentation decreased

            # return one marker now; the remaining markers and the token itself
            # go onto the backlog
            nmarkers = abs(indent_diff) / self.expected_indent
            self.backlog_stack.append(result[0:3])
            for i in xrange(nmarkers - 1):
                self.backlog_stack.append(marker)

            return marker
        elif 'PRE' in result[3]:
            # XXX: PRE postprocessing:
            #   * strip the "{{{\n" and "\n}}}\n" markers (remember that people
            #     recklessly put spaces at EOL)
            #   * strip "\n\\" from the whole text (consistently)

            # NOTE: now is a good time to do the second stripping, as the
            # actual data still starts right after a newline character
            pre_text = result[1].replace('\n\\', '\n')
            s = pre_text.find('\n') + 1
            e = pre_text.rfind('\n', 0, -1)
            pre_text = pre_text[s:e]
            return (result[0], pre_text, result[2])
        else:
            return result[0:3]

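#-----------------------------------------------------------------------------

# Minimal usage sketch (an assumption for illustration, not part of the
# original module): feed the lexer a bit of made-up markup and dump every
# token it produces as JSON.
if __name__ == '__main__':
    sample = (
        "= title =\n"
        "\n"
        "  * first item\n"
        "    with **bold** text\n"
        "\n"
    )
    for tok in Lexer(sample):
        print(json(tok))
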
#-----------------------------------------------------------------------------
# vim:ft=python