utf8 input support, environment matching in parser
author Kai Dietrich <mail@cleeus.de>
Thu, 13 Jan 2011 06:44:30 +0000 (07:44 +0100)
committer Kai Dietrich <mail@cleeus.de>
Thu, 13 Jan 2011 06:44:30 +0000 (07:44 +0100)
yappstest/w2b.g
yappstest/w2b_test.py

index 56f856c430dfde5313cdeea2c80b31cc440c75c6..21a42035f0c140777a925ce0bb2831e1bcafa08d 100644 (file)
@@ -1,60 +1,82 @@
 #first sketches for a yapps2 LL(1) grammar for wiki2beamer and some LaTeX
 
-def match_env(env_in, env_body, env_out):
-       if env_in != env_out:
-               raise SyntaxError('Opened environment %s doesn\'t match closed environment %s.' % (env_open, env_close))
+#The resulting syntax tree is supposed to be a lossless representation of the parsing paths.
+#Every production rule is supposed to return with a tuple ('name_of_rule', [ contents ] ).
+#Every token is supposed to return as a tuple ('TOKEN_NAME', token_content).
+#It should be possible to reconstruct the original parser input from the syntax tree.
+#The parser/scanner is supposed to work on unicode input, not on str input. Thus all
+#tokens should be unicode regular expressions (starting with "(?u)").
+
+#DONE:
+# - lists
+# - environment matching
+
+#TODO:
+# - utf8 input / unicode parsing
+# - environment options
+# - escaping
+# - nowiki
+# - code
+# - typesetting
+# - autotemplate
+# - frames
+# - sectioning
+
+def match_env(tokenname, env_in, env_body, env_out):
+       if env_in[1][1] != env_out[1][1]:
+               raise SyntaxError('Opened environment %s doesn\'t match closed environment %s.' % (env_in[1][1], env_out[1][1]))
                return None
        
-       return ('W2B_ENV', [env_in, env_body, env_out])
+       return (tokenname, [env_in, env_body, env_out])
 
 %%
 
 parser wiki2beamer:
        token END:              "$"
-       token SPACE:            "[ \\t]+"
-       token NEWLINE:          "\\r?\\n"
-       token PARBREAK:         "(\\r?\\n)(\\r?\\n)+"
-       token NUM:              "[0-9]+"
-       token WORD:             "[a-zA-Z]+"
-       token PUNCT:            "(,|\\.|\\?|:|;|\"|'|`|´)+"
-       token PUNCT_SPECIAL:    "[*#<>\\-]"
-       token MINUS:            "-"
-       token COMMA:            ","
-       token LATEX_COMMAND_NAME: "\\\\([a-zA-Z]+)"
-       token BRACKET_CURLY_L:  "{"
-       token BRACKET_CURLY_R:  "}"
-       token BRACKET_ANGLE_L:  "<"
-       token BRACKET_ANGLE_R:  ">"
-       token BRACKET_SQUARE_L: "\\["
-       token BRACKET_SQUARE_R: "\\]"
-       token W2B_H2_L:         "=="
-       token W2B_H2_R:         "=="
-       token W2B_H3_L:         "==="
-       token W2B_H3_R:         "==="
-       token W2B_H4_L:         "===="
-       token W2B_H4_R:         "===="
-       token W2B_ENDFRAME:     "\\[frame\\]>"
-       token OVERLAY_SPEC_SIMPLE: "[0-9-, \\t]+"
-       token W2B_LISTBLOCK_BEGIN: "\\r?\\n?[\\*#]+"
-       token W2B_ESC_EXCLM:    "\\\\!"
-       token W2B_ALERT_L:      "!"
-       token W2B_ALERT_R:      "!"
-       token W2B_ALERT_IN:     "[^!\\r\\n]*"
-       token W2B_BOLD_L:       "'''"
-       token W2B_BOLD_R:       "'''"
-       token W2B_BOLD_IN:      "[^'\\r\\n]*"
-       token W2B_ITALIC_L:     "''"
-       token W2B_ITALIC_R:     "''"
-       token W2B_ITALIC_IN:    "[^'\\r\\n]*"
-       token W2B_ESC_AT:       "\\\\@"
-       token W2B_TEXTTT_L:     "@"
-       token W2B_TEXTTT_R:     "@"
-       token W2B_TEXTTT_IN:    "[^@\\r\\n]*"
-       token W2B_TEXTCOLOR_L:          "_"
-       token W2B_TEXTCOLOR_COLOR:      "[^_\\r\\n]*"
-       token W2B_TEXTCOLOR_MID:        "_"
-       token W2B_TEXTCOLOR_IN:         "[^_\\r\\n]*"
-       token W2B_TEXTCOLOR_R:          "_"
+       token SPACE:            "(?u)[ \\t]+"
+       token NEWLINE:          "(?u)\\r?\\n"
+       token PARBREAK:         "(?u)\\r?\\n\\r?\\n"
+       token NUM:              "(?u)[0-9]+"
+       token WORD:             "(?u)\\w+"
+       token PUNCT:            "(?u)(,|\\.|\\?|:|;|\"|'|`|´)+"
+       token PUNCT_SPECIAL:    "(?u)[*#<>\\-]"
+       token MINUS:            "(?u)-"
+       token COMMA:            "(?u),"
+       token LATEX_COMMAND_NAME: "(?u)\\\\([a-zA-Z]+)"
+       token BRACKET_CURLY_L:  "(?u){"
+       token BRACKET_CURLY_R:  "(?u)}"
+       token BRACKET_ANGLE_L:  "(?u)<"
+       token BRACKET_ANGLE_R:  "(?u)>"
+       token BRACKET_SQUARE_L: "(?u)\\["
+       token BRACKET_SQUARE_R: "(?u)\\]"
+       token W2B_H2_L:         "(?u)=="
+       token W2B_H2_R:         "(?u)=="
+       token W2B_H3_L:         "(?u)==="
+       token W2B_H3_R:         "(?u)==="
+       token W2B_H4_L:         "(?u)===="
+       token W2B_H4_R:         "(?u)===="
+       token W2B_ENDFRAME:     "(?u)\\[frame\\]>"
+       token OVERLAY_SPEC_SIMPLE: "(?u)[0-9-, \\t]+"
+       token W2B_LISTBLOCK_BEGIN: "(?u)\\r?\\n?[\\*#]+"
+       token W2B_ESC_EXCLM:    "(?u)\\\\!"
+       token W2B_ALERT_L:      "(?u)!"
+       token W2B_ALERT_R:      "(?u)!"
+       token W2B_ALERT_IN:     "(?u)[^!\\r\\n]*"
+       token W2B_BOLD_L:       "(?u)'''"
+       token W2B_BOLD_R:       "(?u)'''"
+       token W2B_BOLD_IN:      "(?u)[^'\\r\\n]*"
+       token W2B_ITALIC_L:     "(?u)''"
+       token W2B_ITALIC_R:     "(?u)''"
+       token W2B_ITALIC_IN:    "(?u)[^'\\r\\n]*"
+       token W2B_ESC_AT:       "(?u)\\\\@"
+       token W2B_TEXTTT_L:     "(?u)@"
+       token W2B_TEXTTT_R:     "(?u)@"
+       token W2B_TEXTTT_IN:    "(?u)[^@\\r\\n]*"
+       token W2B_TEXTCOLOR_L:          "(?u)_"
+       token W2B_TEXTCOLOR_COLOR:      "(?u)[^_\\r\\n]*"
+       token W2B_TEXTCOLOR_MID:        "(?u)_"
+       token W2B_TEXTCOLOR_IN:         "(?u)[^_\\r\\n]*"
+       token W2B_TEXTCOLOR_R:          "(?u)_"
 
        token W2B_VSPACE_L:     "--"
        token W2B_VSPACE_R:     "--"
@@ -80,28 +102,57 @@ parser wiki2beamer:
 
        rule document:
                {{ result = [] }}
-               #[[(SPACE|NEWLINE)*] w2b_autotemplate {{result.append(w2b_autotemplate)}}]
-               (
-                       PARBREAK        {{result.append(('PARBREAK', PARBREAK))}}
-               |       paragraph       {{result.append(paragraph)}}
-               )*
+               par_set {{ result.append(par_set) }}
                END {{result.append(('END',))}}
                {{return result}}
                
+       rule par_set:
+               {{ result = [] }}
+               [ NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }} ]
+               (
+                       par_break       {{ result.append(par_break) }}
+               |       par             {{ result.append(par) }}
+               |       w2b_env_multi_line {{ result.append(w2b_env_multi_line) }}
+               )+
+               [ NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }} ]
+               {{ return ('par_set', result) }}
        
-       rule paragraph:
+       rule par_break:
+               {{ result = [] }}
+               PARBREAK {{ result.append(('PARBREAK', PARBREAK)) }}
+               (
+                       NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }}
+               )*
+               {{ return ('par_break', result) }}
+               
+       rule par:
                {{result = None}}
                (
-                       w2b_textblock   {{result = w2b_textblock}}
-               |       w2b_listblock   {{result = w2b_listblock}}
+                       w2b_textblock     {{ result = w2b_textblock }}
+               |       w2b_listblock     {{ result = w2b_listblock }}
                )
-               {{return ('PARAGRAPH', result)}}
+               {{return ('par', result)}}
+       
+       rule w2b_env_multi_line:
+               {{ env_params = [] }}
+               w2b_env_open  {{ env_params.append(w2b_env_open)  }}
+               par_set       {{ env_params.append(par_set) }}
+               w2b_env_close {{ env_params.append(w2b_env_close) }}
+               {{ return match_env('w2b_env_multi_line', env_params[0], env_params[1], env_params[2]) }}
+
+       rule w2b_env_open:
+               W2B_ENV_OPEN_L W2B_ENV_NAME W2B_ENV_OPEN_R
+               {{return ('w2b_env_open', [W2B_ENV_OPEN_L, W2B_ENV_NAME, W2B_ENV_OPEN_R] ) }}
+               
+       rule w2b_env_close:
+               W2B_ENV_CLOSE_L W2B_ENV_NAME W2B_ENV_CLOSE_R
+               {{return ('w2b_env_close', [W2B_ENV_CLOSE_L, W2B_ENV_NAME, W2B_ENV_CLOSE_R] ) }}
        
        rule w2b_nowiki:
                W2B_NOWIKI_OPEN W2B_NOWIKI_IN W2B_NOWIKI_CLOSE
                {{return ('W2B_NOWIKI', W2B_NOWIKI_IN)}}
 
-       #TODO replace W2B_AUTOTEMPLATE_IN with a ruleset        
+       #TODO replace W2B_AUTOTEMPLATE_IN with a production rule
        rule w2b_autotemplate:
                W2B_AUTOTEMPLATE_OPEN W2B_AUTOTEMPLATE_IN W2B_AUTOTEMPLATE_CLOSE
                {{return ('AUTOTEMPLATE', W2B_AUTOTEMPLATE_IN)}}
@@ -112,27 +163,33 @@ parser wiki2beamer:
                        w2b_single_line {{result.append(w2b_single_line)}}
                        [NEWLINE {{result.append(('NEWLINE', NEWLINE))}}]
                )+
-               {{return ('W2B_TEXTBLOCK', result)}}
+               {{return ('w2b_textblock', result)}}
 
        rule w2b_listblock:
+               {{ result = [] }}
+               (
+                       w2b_listitem {{ result.append(w2b_listitem) }}
+               )+
+               {{ return ('w2b_listblock', result) }}
+       
+       rule w2b_listitem:
                {{ result = [] }}
                (
                        W2B_LISTBLOCK_BEGIN {{ result.append(('W2B_LISTBLOCK_BEGIN', W2B_LISTBLOCK_BEGIN)) }}
                        [ overlay_spec {{ result.append(overlay_spec) }} ]
                        [ SPACE {{ result.append(('SPACE', SPACE)) }} ]
-                       [ w2b_single_line {{ result.append( w2b_single_line ) }} ]
-               )+
-               {{ return ('W2B_LIST_BLOCK', result) }}
-       
+                       [ w2b_single_line_with_env {{ result.append( w2b_single_line_with_env ) }} ]
+               )
+               {{ return ('w2b_listitem', result) }}
+               
        rule w2b_single_line:
                {{ result = [] }}
-               
                (
                        w2b_single_line_simple {{ result.append(w2b_single_line_simple) }}
                        [ PUNCT_SPECIAL {{ result.append(('PUNCT_SPECIAL', PUNCT_SPECIAL)) }} ]
                )+
 
-               {{return ('W2B_SINGLE_LINE', result)}}
+               {{return ('w2b_single_line', result)}}
        
        rule w2b_single_line_simple:
                {{ result = [] }}
@@ -148,31 +205,25 @@ parser wiki2beamer:
                |       w2b_text_textcolor      {{ result.append(w2b_text_textcolor) }}
                |       w2b_vspace              {{ result.append(w2b_vspace) }}
                |       w2b_vspacestar          {{ result.append(w2b_vspacestar) }}
-               |       w2b_env_single_line     {{ result.append(w2b_env_single_line) }}
-               #|      w2b_env_open            {{ result.append(w2b_env_open) }}
-               #|      w2b_env_close           {{ result.append(w2b_env_close) }}
-               #|      w2b_nowiki              {{ result.append(w2b_nowiki) }}
                |       PUNCT                   {{ result.append(('PUNCT', PUNCT)) }}
                )+
-               {{return ('W2B_SINGLE_LINE', result)}}
+               {{return ('w2b_single_line_simple', result)}}
        
+       rule w2b_single_line_with_env:
+               {{ result = [] }}
+               (
+                       w2b_single_line     {{ result.append(w2b_single_line) }}
+               |       w2b_env_single_line {{ result.append(w2b_env_single_line) }}
+               )+
+               {{ return ('w2b_single_line_with_env', result) }}
 
-       #TODO environment open/close matching during parsing
        rule w2b_env_single_line:
-               {{ env_match_params = [None, None, None] }}
-               W2B_ENV_OPEN_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_OPEN_R {{ env_match_params[0] = W2B_ENV_NAME }}
-               w2b_single_line_simple {{ env_match_params[1] = w2b_single_line_simple }}
-               W2B_ENV_CLOSE_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_CLOSE_R {{ env_match_params[2] = W2B_ENV_NAME }}
-               {{ return ('W2B_ENV_SINGLE_LINE', match_env(*env_match_params)) }}
+               {{ env_params = [] }}
+               w2b_env_open  {{ env_params.append(w2b_env_open)  }}
+               w2b_single_line  {{ env_params.append(w2b_single_line) }}
+               w2b_env_close {{ env_params.append(w2b_env_close) }}
+               {{ return match_env('w2b_env_single_line', env_params[0], env_params[1], env_params[2]) }}
 
-       rule w2b_env_open:
-               W2B_ENV_OPEN_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_OPEN_R
-               {{return ('W2B_ENV_OPEN', W2B_ENV_NAME) }}
-               
-       rule w2b_env_close:
-               W2B_ENV_CLOSE_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_CLOSE_R
-               {{return ('W2B_ENV_CLOSE', W2B_ENV_NAME) }}
-       
        rule w2b_escape_seq:
                {{ result = None }}
                (
@@ -181,35 +232,35 @@ parser wiki2beamer:
                )
                {{ return result }}
        
-       #TODO hack parsing of typesetting contents outside of the grammar
+       #TODO parsing of typesetting contents
        rule w2b_text_alert:
                W2B_ALERT_L W2B_ALERT_IN W2B_ALERT_R
-               {{ return ('W2B_ALERT', W2B_ALERT_IN) }}
+               {{ return ('w2b_text_alert', W2B_ALERT_IN) }}
 
        rule w2b_text_bold:
                W2B_BOLD_L W2B_BOLD_IN W2B_BOLD_R
-               {{ return ('W2B_BOLD', W2B_BOLD_IN) }}
+               {{ return ('w2b_text_bold', W2B_BOLD_IN) }}
 
        rule w2b_text_italic:
                W2B_ITALIC_L W2B_ITALIC_IN W2B_ITALIC_R
-               {{ return ('W2B_ITALIC', W2B_ITALIC_IN) }}
+               {{ return ('w2b_text_italic', W2B_ITALIC_IN) }}
 
        rule w2b_text_texttt:
                W2B_TEXTTT_L W2B_TEXTTT_IN W2B_TEXTTT_R
-               {{ return ('W2B_TEXTTT', W2B_TEXTTT_IN) }}
+               {{ return ('w2b_text_texttt', W2B_TEXTTT_IN) }}
 
        rule w2b_text_textcolor:
                W2B_TEXTCOLOR_L W2B_TEXTCOLOR_COLOR W2B_TEXTCOLOR_MID W2B_TEXTCOLOR_IN W2B_TEXTCOLOR_R
-               {{return ('W2B_TEXTCOLOR', W2B_TEXTCOLOR_COLOR, W2B_TEXTCOLOR_IN)}}
+               {{return ('w2b_text_textcolor', W2B_TEXTCOLOR_COLOR, W2B_TEXTCOLOR_IN)}}
        
        rule w2b_vspace:
                W2B_VSPACE_L W2B_VSPACE_IN W2B_VSPACE_R
-               {{ return ('W2B_VSPACE', W2B_VSPACE_IN) }}
+               {{ return ('w2b_vspace', W2B_VSPACE_IN) }}
        
        rule w2b_vspacestar:
                W2B_VSPACESTAR_L W2B_VSPACE_IN W2B_VSPACE_R
-               {{ return ('W2B_VSPACESTAR', W2B_VSPACE_IN) }}
+               {{ return ('w2b_vspacestar', W2B_VSPACE_IN) }}
 
        rule overlay_spec:
                BRACKET_ANGLE_L OVERLAY_SPEC_SIMPLE BRACKET_ANGLE_R
-               {{return ('OVERLAY_SPEC', OVERLAY_SPEC_SIMPLE)}}
+               {{return ('overlay_spec', [BRACKET_ANGLE_L, OVERLAY_SPEC_SIMPLE, BRACKET_ANGLE_R] ) }}
index 91a1602de7d7508bfa8d3cb285cfeb9c03ed0359..5dde30e885e17c3f24d5f5ab8bc1ba156baf4737 100644 (file)
@@ -5,7 +5,9 @@ from yapps import yappsrt
 import traceback
 import sys
 
-def ast_print_escape(string):
+
+def ast_print_escape(ustring):
+    string = ustring.encode("utf8")
     string = string.replace('\n', '\\n')
     string = string.replace('\r', '\\r')
     string = string.replace('\t', '\\t')
@@ -31,6 +33,8 @@ def ast_print(ast_node, level=0):
             print '\n%s%s()' % (space_prefix, ast_node[0]) ,
 
     elif type(ast_node) == str:
+        print 'WARNING -- str found, fix parser!!!: "%s"' % (ast_print_escape(ast_node)) ,
+    elif type(ast_node) == unicode:
         print '"%s"' % (ast_print_escape(ast_node)) ,
     else:
         print type(ast_node)
@@ -46,6 +50,7 @@ if __name__ == '__main__':
             f = stdin
         
         text = f.read()
+        text = text.decode("utf8")
         scanner = w2b.wiki2beamerScanner(text)
         parser = w2b.wiki2beamer(scanner)
         ast = None