#first sketches for a yapps2 LL(1) grammar for wiki2beamer and some LaTeX
-def match_env(env_in, env_body, env_out):
- if env_in != env_out:
- raise SyntaxError('Opened environment %s doesn\'t match closed environment %s.' % (env_open, env_close))
+#The resulting syntax tree is supposed to be a lossless representation of the parsing paths.
+#Every production rule is supposed to return with a tuple ('name_of_rule', [ contents ] ).
+#Every token is supposed to return as a tuple ('TOKEN_NAME', token_content).
+#It should be possible to reconstruct the original parser input from the syntax tree.
+#The parser/scanner is supposed to work on unicode input, not in str input. Thus all
+#tokens should be unicode regular expressions (starting with "(?u)").
+
+#DONE:
+# - lists
+# - environment matching
+
+#TODO:
+# - utf8 input / unicode parsing
+# - environment options
+# - escaping
+# - nowiki
+# - code
+# - typesetting
+# - autotemplate
+# - frames
+# - sectioning
+
+def match_env(tokenname, env_in, env_body, env_out):
+ if env_in[1][1] != env_out[1][1]:
+ raise SyntaxError('Opened environment %s doesn\'t match closed environment %s.' % (env_in[1][1], env_out[1][1]))
return None
- return ('W2B_ENV', [env_in, env_body, env_out])
+ return (tokenname, [env_in, env_body, env_out])
%%
parser wiki2beamer:
token END: "$"
- token SPACE: "[ \\t]+"
- token NEWLINE: "\\r?\\n"
- token PARBREAK: "(\\r?\\n)(\\r?\\n)+"
- token NUM: "[0-9]+"
- token WORD: "[a-zA-Z]+"
- token PUNCT: "(,|\\.|\\?|:|;|\"|'|`|´)+"
- token PUNCT_SPECIAL: "[*#<>\\-]"
- token MINUS: "-"
- token COMMA: ","
- token LATEX_COMMAND_NAME: "\\\\([a-zA-Z]+)"
- token BRACKET_CURLY_L: "{"
- token BRACKET_CURLY_R: "}"
- token BRACKET_ANGLE_L: "<"
- token BRACKET_ANGLE_R: ">"
- token BRACKET_SQUARE_L: "\\["
- token BRACKET_SQUARE_R: "\\]"
- token W2B_H2_L: "=="
- token W2B_H2_R: "=="
- token W2B_H3_L: "==="
- token W2B_H3_R: "==="
- token W2B_H4_L: "===="
- token W2B_H4_R: "===="
- token W2B_ENDFRAME: "\\[frame\\]>"
- token OVERLAY_SPEC_SIMPLE: "[0-9-, \\t]+"
- token W2B_LISTBLOCK_BEGIN: "\\r?\\n?[\\*#]+"
- token W2B_ESC_EXCLM: "\\\\!"
- token W2B_ALERT_L: "!"
- token W2B_ALERT_R: "!"
- token W2B_ALERT_IN: "[^!\\r\\n]*"
- token W2B_BOLD_L: "'''"
- token W2B_BOLD_R: "'''"
- token W2B_BOLD_IN: "[^'\\r\\n]*"
- token W2B_ITALIC_L: "''"
- token W2B_ITALIC_R: "''"
- token W2B_ITALIC_IN: "[^'\\r\\n]*"
- token W2B_ESC_AT: "\\\\@"
- token W2B_TEXTTT_L: "@"
- token W2B_TEXTTT_R: "@"
- token W2B_TEXTTT_IN: "[^@\\r\\n]*"
- token W2B_TEXTCOLOR_L: "_"
- token W2B_TEXTCOLOR_COLOR: "[^_\\r\\n]*"
- token W2B_TEXTCOLOR_MID: "_"
- token W2B_TEXTCOLOR_IN: "[^_\\r\\n]*"
- token W2B_TEXTCOLOR_R: "_"
+ token SPACE: "(?u)[ \\t]+"
+ token NEWLINE: "(?u)\\r?\\n"
+ token PARBREAK: "(?u)\\r?\\n\\r?\\n"
+ token NUM: "(?u)[0-9]+"
+ token WORD: "(?u)\\w+"
+ token PUNCT: "(?u)(,|\\.|\\?|:|;|\"|'|`|´)+"
+ token PUNCT_SPECIAL: "(?u)[*#<>\\-]"
+ token MINUS: "(?u)-"
+ token COMMA: "(?u),"
+ token LATEX_COMMAND_NAME: "(?u)\\\\([a-zA-Z]+)"
+ token BRACKET_CURLY_L: "(?u){"
+ token BRACKET_CURLY_R: "(?u)}"
+ token BRACKET_ANGLE_L: "(?u)<"
+ token BRACKET_ANGLE_R: "(?u)>"
+ token BRACKET_SQUARE_L: "(?u)\\["
+ token BRACKET_SQUARE_R: "(?u)\\]"
+ token W2B_H2_L: "(?u)=="
+ token W2B_H2_R: "(?u)=="
+ token W2B_H3_L: "(?u)==="
+ token W2B_H3_R: "(?u)==="
+ token W2B_H4_L: "(?u)===="
+ token W2B_H4_R: "(?u)===="
+ token W2B_ENDFRAME: "(?u)\\[frame\\]>"
+ token OVERLAY_SPEC_SIMPLE: "(?u)[0-9-, \\t]+"
+ token W2B_LISTBLOCK_BEGIN: "(?u)\\r?\\n?[\\*#]+"
+ token W2B_ESC_EXCLM: "(?u)\\\\!"
+ token W2B_ALERT_L: "(?u)!"
+ token W2B_ALERT_R: "(?u)!"
+ token W2B_ALERT_IN: "(?u)[^!\\r\\n]*"
+ token W2B_BOLD_L: "(?u)'''"
+ token W2B_BOLD_R: "(?u)'''"
+ token W2B_BOLD_IN: "(?u)[^'\\r\\n]*"
+ token W2B_ITALIC_L: "(?u)''"
+ token W2B_ITALIC_R: "(?u)''"
+ token W2B_ITALIC_IN: "(?u)[^'\\r\\n]*"
+ token W2B_ESC_AT: "(?u)\\\\@"
+ token W2B_TEXTTT_L: "(?u)@"
+ token W2B_TEXTTT_R: "(?u)@"
+ token W2B_TEXTTT_IN: "(?u)[^@\\r\\n]*"
+ token W2B_TEXTCOLOR_L: "(?u)_"
+ token W2B_TEXTCOLOR_COLOR: "(?u)[^_\\r\\n]*"
+ token W2B_TEXTCOLOR_MID: "(?u)_"
+ token W2B_TEXTCOLOR_IN: "(?u)[^_\\r\\n]*"
+ token W2B_TEXTCOLOR_R: "(?u)_"
token W2B_VSPACE_L: "--"
token W2B_VSPACE_R: "--"
rule document:
{{ result = [] }}
- #[[(SPACE|NEWLINE)*] w2b_autotemplate {{result.append(w2b_autotemplate)}}]
- (
- PARBREAK {{result.append(('PARBREAK', PARBREAK))}}
- | paragraph {{result.append(paragraph)}}
- )*
+ par_set {{ result.append(par_set) }}
END {{result.append(('END',))}}
{{return result}}
+ rule par_set:
+ {{ result = [] }}
+ [ NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }} ]
+ (
+ par_break {{ result.append(par_break) }}
+ | par {{ result.append(par) }}
+ | w2b_env_multi_line {{ result.append(w2b_env_multi_line) }}
+ )+
+ [ NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }} ]
+ {{ return ('par_set', result) }}
- rule paragraph:
+ rule par_break:
+ {{ result = [] }}
+ PARBREAK {{ result.append(('PARBREAK', PARBREAK)) }}
+ (
+ NEWLINE {{ result.append(('NEWLINE', NEWLINE)) }}
+ )*
+ {{ return ('par_break', result) }}
+
+ rule par:
{{result = None}}
(
- w2b_textblock {{result = w2b_textblock}}
- | w2b_listblock {{result = w2b_listblock}}
+ w2b_textblock {{ result = w2b_textblock }}
+ | w2b_listblock {{ result = w2b_listblock }}
)
- {{return ('PARAGRAPH', result)}}
+ {{return ('par', result)}}
+
+ rule w2b_env_multi_line:
+ {{ env_params = [] }}
+ w2b_env_open {{ env_params.append(w2b_env_open) }}
+ par_set {{ env_params.append(par_set) }}
+ w2b_env_close {{ env_params.append(w2b_env_close) }}
+ {{ return match_env('w2b_env_multi_line', env_params[0], env_params[1], env_params[2]) }}
+
+ rule w2b_env_open:
+ W2B_ENV_OPEN_L W2B_ENV_NAME W2B_ENV_OPEN_R
+ {{return ('w2b_env_open', [W2B_ENV_OPEN_L, W2B_ENV_NAME, W2B_ENV_OPEN_R] ) }}
+
+ rule w2b_env_close:
+ W2B_ENV_CLOSE_L W2B_ENV_NAME W2B_ENV_CLOSE_R
+ {{return ('w2b_env_close', [W2B_ENV_CLOSE_L, W2B_ENV_NAME, W2B_ENV_CLOSE_R] ) }}
rule w2b_nowiki:
W2B_NOWIKI_OPEN W2B_NOWIKI_IN W2B_NOWIKI_CLOSE
{{return ('W2B_NOWIKI', W2B_NOWIKI_IN)}}
- #TODO replace W2B_AUTOTEMPLATE_IN with a ruleset
+ #TODO replace W2B_AUTOTEMPLATE_IN with a production rule
rule w2b_autotemplate:
W2B_AUTOTEMPLATE_OPEN W2B_AUTOTEMPLATE_IN W2B_AUTOTEMPLATE_CLOSE
{{return ('AUTOTEMPLATE', W2B_AUTOTEMPLATE_IN)}}
w2b_single_line {{result.append(w2b_single_line)}}
[NEWLINE {{result.append(('NEWLINE', NEWLINE))}}]
)+
- {{return ('W2B_TEXTBLOCK', result)}}
+ {{return ('w2b_textblock', result)}}
rule w2b_listblock:
+ {{ result = [] }}
+ (
+ w2b_listitem {{ result.append(w2b_listitem) }}
+ )+
+ {{ return ('w2b_listblock', result) }}
+
+ rule w2b_listitem:
{{ result = [] }}
(
W2B_LISTBLOCK_BEGIN {{ result.append(('W2B_LISTBLOCK_BEGIN', W2B_LISTBLOCK_BEGIN)) }}
[ overlay_spec {{ result.append(overlay_spec) }} ]
[ SPACE {{ result.append(('SPACE', SPACE)) }} ]
- [ w2b_single_line {{ result.append( w2b_single_line ) }} ]
- )+
- {{ return ('W2B_LIST_BLOCK', result) }}
-
+ [ w2b_single_line_with_env {{ result.append( w2b_single_line_with_env ) }} ]
+ )
+ {{ return ('w2b_listitem', result) }}
+
rule w2b_single_line:
{{ result = [] }}
-
(
w2b_single_line_simple {{ result.append(w2b_single_line_simple) }}
[ PUNCT_SPECIAL {{ result.append(('PUNCT_SPECIAL', PUNCT_SPECIAL)) }} ]
)+
- {{return ('W2B_SINGLE_LINE', result)}}
+ {{return ('w2b_single_line', result)}}
rule w2b_single_line_simple:
{{ result = [] }}
| w2b_text_textcolor {{ result.append(w2b_text_textcolor) }}
| w2b_vspace {{ result.append(w2b_vspace) }}
| w2b_vspacestar {{ result.append(w2b_vspacestar) }}
- | w2b_env_single_line {{ result.append(w2b_env_single_line) }}
- #| w2b_env_open {{ result.append(w2b_env_open) }}
- #| w2b_env_close {{ result.append(w2b_env_close) }}
- #| w2b_nowiki {{ result.append(w2b_nowiki) }}
| PUNCT {{ result.append(('PUNCT', PUNCT)) }}
)+
- {{return ('W2B_SINGLE_LINE', result)}}
+ {{return ('w2b_single_line_simple', result)}}
+ rule w2b_single_line_with_env:
+ {{ result = [] }}
+ (
+ w2b_single_line {{ result.append(w2b_single_line) }}
+ | w2b_env_single_line {{ result.append(w2b_env_single_line) }}
+ )+
+ {{ return ('w2b_single_line_with_env', result) }}
- #TODO environment open/close matching during parsing
rule w2b_env_single_line:
- {{ env_match_params = [None, None, None] }}
- W2B_ENV_OPEN_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_OPEN_R {{ env_match_params[0] = W2B_ENV_NAME }}
- w2b_single_line_simple {{ env_match_params[1] = w2b_single_line_simple }}
- W2B_ENV_CLOSE_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_CLOSE_R {{ env_match_params[2] = W2B_ENV_NAME }}
- {{ return ('W2B_ENV_SINGLE_LINE', match_env(*env_match_params)) }}
+ {{ env_params = [] }}
+ w2b_env_open {{ env_params.append(w2b_env_open) }}
+ w2b_single_line {{ env_params.append(w2b_single_line) }}
+ w2b_env_close {{ env_params.append(w2b_env_close) }}
+ {{ return match_env('w2b_env_single_line', env_params[0], env_params[1], env_params[2]) }}
- rule w2b_env_open:
- W2B_ENV_OPEN_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_OPEN_R
- {{return ('W2B_ENV_OPEN', W2B_ENV_NAME) }}
-
- rule w2b_env_close:
- W2B_ENV_CLOSE_L [SPACE] W2B_ENV_NAME [SPACE] W2B_ENV_CLOSE_R
- {{return ('W2B_ENV_CLOSE', W2B_ENV_NAME) }}
-
rule w2b_escape_seq:
{{ result = None }}
(
)
{{ return result }}
- #TODO hack parsing of typesetting contents outside of the grammar
+ #TODO parsing of typesetting contents
rule w2b_text_alert:
W2B_ALERT_L W2B_ALERT_IN W2B_ALERT_R
- {{ return ('W2B_ALERT', W2B_ALERT_IN) }}
+ {{ return ('w2b_text_alert', W2B_ALERT_IN) }}
rule w2b_text_bold:
W2B_BOLD_L W2B_BOLD_IN W2B_BOLD_R
- {{ return ('W2B_BOLD', W2B_BOLD_IN) }}
+ {{ return ('w2b_text_bold', W2B_BOLD_IN) }}
rule w2b_text_italic:
W2B_ITALIC_L W2B_ITALIC_IN W2B_ITALIC_R
- {{ return ('W2B_ITALIC', W2B_ITALIC_IN) }}
+ {{ return ('w2b_text_italic', W2B_ITALIC_IN) }}
rule w2b_text_texttt:
W2B_TEXTTT_L W2B_TEXTTT_IN W2B_TEXTTT_R
- {{ return ('W2B_TEXTTT', W2B_TEXTTT_IN) }}
+ {{ return ('w2b_text_texttt', W2B_TEXTTT_IN) }}
rule w2b_text_textcolor:
W2B_TEXTCOLOR_L W2B_TEXTCOLOR_COLOR W2B_TEXTCOLOR_MID W2B_TEXTCOLOR_IN W2B_TEXTCOLOR_R
- {{return ('W2B_TEXTCOLOR', W2B_TEXTCOLOR_COLOR, W2B_TEXTCOLOR_IN)}}
+ {{return ('w2b_text_textcolor', W2B_TEXTCOLOR_COLOR, W2B_TEXTCOLOR_IN)}}
rule w2b_vspace:
W2B_VSPACE_L W2B_VSPACE_IN W2B_VSPACE_R
- {{ return ('W2B_VSPACE', W2B_VSPACE_IN) }}
+ {{ return ('w2b_vspace', W2B_VSPACE_IN) }}
rule w2b_vspacestar:
W2B_VSPACESTAR_L W2B_VSPACE_IN W2B_VSPACE_R
- {{ return ('W2B_VSPACESTAR', W2B_VSPACE_IN) }}
+ {{ return ('w2b_vspacestar', W2B_VSPACE_IN) }}
rule overlay_spec:
BRACKET_ANGLE_L OVERLAY_SPEC_SIMPLE BRACKET_ANGLE_R
- {{return ('OVERLAY_SPEC', OVERLAY_SPEC_SIMPLE)}}
+ {{return ('overlay_spec', [BRACKET_ANGLE_L, OVERLAY_SPEC_SIMPLE, BRACKET_ANGLE_R] ) }}