diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index bd6a462..f8c9dda 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -17,9 +17,20 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install textparser development and testing dependencies + run: | + pip install --upgrade pip + pip install . + pip install .[test] - name: Test run: | python -m unittest + - name: Static type checking (mypy) + run: | + python -m mypy --strict textparser.py tests + - name: Linting (ruff) + run: | + ruff check textparser.py tests release: needs: [test] diff --git a/py.typed b/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index e7ed7a5..8f321e8 100755 --- a/setup.py +++ b/setup.py @@ -26,4 +26,10 @@ def find_version(): url='https://github.com/eerimoq/textparser', py_modules=['textparser'], python_requires='>=3.10', + extras_require={ + "test": [ + "mypy >= 2.1", + "ruff >= 0.15.12", + ], + }, test_suite="tests") diff --git a/tests/test_textparser.py b/tests/test_textparser.py index 0af6c8b..d4d2f4c 100644 --- a/tests/test_textparser.py +++ b/tests/test_textparser.py @@ -1,9 +1,14 @@ +import collections import pickle import unittest from collections import namedtuple +from typing import cast + import textparser +from textparser import MatchObject from textparser import Grammar +from textparser import Pattern from textparser import Sequence from textparser import Choice from textparser import choice @@ -14,8 +19,10 @@ from textparser import OneOrMoreDict from textparser import DelimitedList from textparser import Token +from textparser import _Tokens from textparser import TokenizeError from textparser import tokenize_init +from textparser import _Mismatch from textparser import Any from textparser import AnyUntil from textparser import Optional @@ -27,8 +34,19 @@ from textparser import markup_line from textparser import replace_blocks +# list of tuples containing the arguments for the Token class. Used to +# create a list of Token objects. +TokenizeItems = list[tuple[str,str]|tuple[str,str,int]] + +# Specify the tree of tokens and the expected match result for a given +# grammar +GrammarMatchSpec = tuple[TokenizeItems, MatchObject] + +# Specify the tree of tokens and the line number where the grammar +# is supposed to not match the token tree +GrammarMismatchSpec = tuple[TokenizeItems, int] -def tokenize(items, add_eof_token=True): +def tokenize(items: TokenizeItems, add_eof_token: bool=True) -> list[Token]: tokens = [] for item in items: @@ -47,30 +65,30 @@ def tokenize(items, add_eof_token=True): class TextParserTest(unittest.TestCase): - def parse_and_assert_tree(self, grammar, datas): - for tokens, expected_tree in datas: - tree = grammar.parse(tokenize(tokens)) - self.assertEqual(tree, expected_tree) + def parse_and_assert_tree(self, grammar: Grammar, test_specs: list[GrammarMatchSpec]) -> None: + for token_items, expected_tree in test_specs: + token_tree = grammar.parse(tokenize(token_items)) + self.assertEqual(token_tree, expected_tree) - def parse_and_assert_mismatch(self, grammar, datas): - for tokens, line in datas: - tokens = tokenize(tokens) + def parse_and_assert_mismatch(self, grammar: Grammar, test_specs: list[GrammarMismatchSpec]) -> None: + for token_items, line in test_specs: + token_tree = tokenize(token_items) with self.assertRaises(textparser.GrammarError) as cm: - grammar.parse(tokens) + grammar.parse(token_tree) self.assertEqual(cm.exception.offset, line) - def test_grammar_sequence(self): + def test_grammar_sequence(self) -> None: grammar = Grammar(Sequence('NUMBER', 'WORD')) tokens = tokenize([ ('NUMBER', '1.45'), ('WORD', 'm') ]) - tree = grammar.parse(tokens) - self.assertEqual(tree, ['1.45', 'm']) + match_object = grammar.parse(tokens) + self.assertEqual(match_object, ['1.45', 'm']) - def test_grammar_sequence_mismatch(self): + def test_grammar_sequence_mismatch(self) -> None: grammar = Grammar(Sequence('NUMBER', 'WORD')) tokens = tokenize([('NUMBER', '1.45')]) @@ -79,10 +97,10 @@ def test_grammar_sequence_mismatch(self): self.assertEqual(cm.exception.offset, -1) - def test_grammar_choice(self): + def test_grammar_choice(self) -> None: grammar = Grammar(Choice('NUMBER', 'WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'm')], 'm' @@ -95,18 +113,18 @@ def test_grammar_choice(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_choice_mismatch(self): + def test_grammar_choice_mismatch(self) -> None: grammar = Grammar(Choice(Sequence('NUMBER', 'WORD'), 'WORD')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ([('NUMBER', '1', 5)], -1), ([('NUMBER', '1', 5), ('NUMBER', '2', 7)], 7) ] self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_choice_dict(self): + def test_grammar_choice_dict(self) -> None: number = Forward() number <<= Sequence('NUMBER') grammar = Grammar(ChoiceDict(number, @@ -114,10 +132,13 @@ def test_grammar_choice_dict(self): ChoiceDict('BAR'), 'FIE')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'm')], - ('foo', ['m']) + # the cast is necessary because mypy does not + # recognize (str, MatchObject) tuples as MatchObject, + # even though it should... + cast(MatchObject, ('foo', ['m'])) ), ( [('NUMBER', '5')], @@ -135,18 +156,18 @@ def test_grammar_choice_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_choice_dict_mismatch(self): + def test_grammar_choice_dict_mismatch(self) -> None: grammar = Grammar(ChoiceDict(Sequence('NUMBER'), Sequence('WORD'))) tokens = tokenize([(',', ',', 3)]) - with self.assertRaises(textparser.Error) as cm: + with self.assertRaises(textparser.GrammarError) as cm: grammar.parse(tokens) self.assertEqual(cm.exception.offset, 3) - def test_grammar_choice_dict_init(self): - datas = [ + def test_grammar_choice_dict_init(self) -> None: + datas: list[tuple[collections.abc.Sequence[Pattern|str], str]] = [ ( ('WORD', 'WORD'), "First token kind must be unique, but WORD isn't." @@ -167,10 +188,10 @@ def test_grammar_choice_dict_init(self): self.assertEqual(str(cm.exception), message) - def test_grammar_delimited_list(self): + def test_grammar_delimited_list(self) -> None: grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo')], [['foo'], []] @@ -187,10 +208,10 @@ def test_grammar_delimited_list(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_delimited_list_mismatch(self): + def test_grammar_delimited_list_mismatch(self) -> None: grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.'))) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [ ('WORD', 'foo', 1), @@ -212,10 +233,10 @@ def test_grammar_delimited_list_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_zero_or_more(self): + def test_grammar_zero_or_more(self) -> None: grammar = Grammar(ZeroOrMore('WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], [] @@ -232,11 +253,11 @@ def test_grammar_zero_or_more(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_zero_or_more_partial_element_match(self): + def test_grammar_zero_or_more_partial_element_match(self) -> None: grammar = Grammar(Sequence( ZeroOrMore(Sequence('WORD', 'NUMBER')), 'WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [ ('WORD', 'foo'), @@ -250,10 +271,10 @@ def test_grammar_zero_or_more_partial_element_match(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_zero_or_more_dict(self): + def test_grammar_zero_or_more_dict(self) -> None: grammar = Grammar(ZeroOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], {} @@ -271,10 +292,10 @@ def test_grammar_zero_or_more_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more(self): + def test_grammar_one_or_more(self) -> None: grammar = Grammar(OneOrMore('WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo')], ['foo'] @@ -287,10 +308,10 @@ def test_grammar_one_or_more(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more_mismatch(self): + def test_grammar_one_or_more_mismatch(self) -> None: grammar = Grammar(OneOrMore('WORD')) - datas = [ + datas = cast(list[GrammarMismatchSpec], [ ( [] , -1 @@ -299,14 +320,14 @@ def test_grammar_one_or_more_mismatch(self): [('NUMBER', 'foo', 2)], 2 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_one_or_more_dict(self): + def test_grammar_one_or_more_dict(self) -> None: grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo'), ('NUMBER', '1')], { @@ -326,10 +347,10 @@ def test_grammar_one_or_more_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more_dict_mismatch(self): + def test_grammar_one_or_more_dict_mismatch(self) -> None: grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas = cast(list[GrammarMismatchSpec], [ ( [('WORD', 'foo', 5)], -1 @@ -350,14 +371,14 @@ def test_grammar_one_or_more_dict_mismatch(self): ], 8 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_any(self): + def test_grammar_any(self) -> None: grammar = Grammar(Any()) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('A', r'a')], 'a' @@ -370,10 +391,10 @@ def test_grammar_any(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_any_until(self): + def test_grammar_any_until(self) -> None: grammar = Grammar(Sequence(AnyUntil('STRING'), 'STRING')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1'), ('WORD', 'a'), @@ -384,12 +405,12 @@ def test_grammar_any_until(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_any_until_sequence(self): + def test_grammar_any_until_sequence(self) -> None: grammar = Grammar(Sequence(AnyUntil(Sequence('WORD', 'STRING')), 'WORD', 'STRING')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1'), ('WORD', 'a'), @@ -401,7 +422,7 @@ def test_grammar_any_until_sequence(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_1(self): + def test_grammar_1(self) -> None: grammar = Grammar(Sequence( 'IF', choice(Sequence(choice('A', 'B'), 'STRING'), @@ -412,7 +433,7 @@ def test_grammar_1(self): choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), '.'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [ ('IF', 'IF'), @@ -437,7 +458,7 @@ def test_grammar_1(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_1_mismatch(self): + def test_grammar_1_mismatch(self) -> None: grammar = Grammar(Sequence( 'IF', choice(Sequence(choice('A', 'B'), 'STRING'), @@ -448,7 +469,7 @@ def test_grammar_1_mismatch(self): choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), '.'))) - datas = [ + datas = cast(list[GrammarMismatchSpec], [ ( [ ('IF', 'IF', 1), @@ -483,16 +504,16 @@ def test_grammar_1_mismatch(self): ], 5 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_forward(self): + def test_grammar_forward(self) -> None: foo = Forward() foo <<= Sequence('FOO') grammar = Grammar(foo) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('FOO', 'foo')], ['foo'] @@ -501,12 +522,12 @@ def test_grammar_forward(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_forward_text(self): + def test_grammar_forward_text(self) -> None: foo = Forward() foo <<= 'FOO' grammar = Grammar(foo) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('FOO', 'foo')], 'foo' @@ -515,12 +536,12 @@ def test_grammar_forward_text(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_optional(self): + def test_grammar_optional(self) -> None: grammar = Grammar(Sequence(Optional('WORD'), Optional('WORD'), Optional('NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], [[], [], []] @@ -545,33 +566,33 @@ def test_grammar_optional(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_tag(self): + def test_grammar_tag(self) -> None: grammar = Grammar(Tag('a', Tag('b', choice(Tag('c', 'WORD'), Tag('d', Optional('NUMBER')))))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'bar')], - ('a', ('b', ('c', 'bar'))) + cast(MatchObject, ('a', ('b', ('c', 'bar')))) ), ( [('NUMBER', '1')], - ('a', ('b', ('d', ['1']))) + cast(MatchObject, ('a', ('b', ('d', ['1'])))) ), ( [], - ('a', ('b', ('d', []))) + cast(MatchObject, ('a', ('b', ('d', [])))) ) ] self.parse_and_assert_tree(grammar, datas) - def test_grammar_tag_mismatch(self): + def test_grammar_tag_mismatch(self) -> None: grammar = Grammar(Tag('a', 'WORD')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('NUMBER', 'bar')], 1 @@ -580,10 +601,10 @@ def test_grammar_tag_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_and(self): + def test_grammar_and(self) -> None: grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], [[], '1'] @@ -592,10 +613,10 @@ def test_grammar_and(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_and_mismatch(self): + def test_grammar_and_mismatch(self) -> None: grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 3 @@ -604,10 +625,10 @@ def test_grammar_and_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_not(self): + def test_grammar_not(self) -> None: grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], [[], '1'] @@ -616,10 +637,10 @@ def test_grammar_not(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_not_mismatch(self): + def test_grammar_not_mismatch(self) -> None: grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 3 @@ -628,10 +649,10 @@ def test_grammar_not_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_no_match(self): + def test_grammar_no_match(self) -> None: grammar = Grammar(NoMatch()) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('NUMBER', '1', 3)], 3 @@ -644,20 +665,20 @@ def test_grammar_no_match(self): self.parse_and_assert_mismatch(grammar, datas) - def test_parse_start_and_end_of_file(self): + def test_parse_start_and_end_of_file(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__SOF__', '__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__SOF__', '__EOF__')) self.assertEqual(Parser().parse('', match_sof=True), ['__SOF__', '__EOF__']) - def test_parse_start_of_file_mismatch(self): + def test_parse_start_of_file_mismatch(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__EOF__')) with self.assertRaises(textparser.ParseError) as cm: Parser().parse('123', match_sof=True) @@ -665,43 +686,46 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 1, column 1: ">>!<<123"') - def test_parse_end_of_file(self): + def test_parse_end_of_file(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return '__EOF__' + def grammar(self) -> Grammar: + return Grammar('__EOF__') self.assertEqual(Parser().parse('', match_sof=False), '__EOF__') - def test_grammar_none(self): + def test_grammar_none(self) -> None: class AnyAsNone(textparser.Pattern): - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: tokens.get_value() - return None + # the cast is a bit hacky because Pattern.match() is + # not supposed to return None. (this should possibly + # return textparser.MISMATCH) + return cast(MatchObject, None) grammar = Grammar(AnyAsNone()) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], - None + cast(MatchObject, None) ) ] self.parse_and_assert_tree(grammar, datas) - def test_grammar_error(self): + def test_grammar_error(self) -> None: grammar = Grammar(NoMatch()) - datas = [ + datas: list[list[tuple[str, str]|tuple[str, str, int]]] = [ [('NUMBER', '1', 3)], [('WORD', 'foo', 3)] ] - for tokens in datas: - tokens = tokenize(tokens) + for token_args in datas: + tokens = tokenize(token_args) with self.assertRaises(textparser.GrammarError) as cm: grammar.parse(tokens) @@ -710,8 +734,9 @@ def test_grammar_error(self): self.assertEqual(str(cm.exception), 'Invalid syntax at offset 3.') - def test_tokenize_error(self): - datas = [ + def test_tokenize_error(self) -> None: + # list of (offset, text, message) tuples + datas: list[tuple[int, str, str]] = [ (2, 'hej', 'Invalid syntax at line 1, column 3: "he>>!<>!<>!<<"'), @@ -726,8 +751,9 @@ def test_tokenize_error(self): self.assertEqual(cm.exception.offset, offset) self.assertEqual(str(cm.exception), message) - def test_create_token_re(self): - datas = [ + def test_create_token_re(self) -> None: + # list of (TokenTree, expected_regex) tuples + datas: list[tuple[TokenizeItems, str]] = [ ( [('A', r'a')], '(?Pa)' @@ -744,17 +770,17 @@ def test_create_token_re(self): [Token(kind='__SOF__', value='__SOF__', offset=0)]) self.assertEqual(re_token, expected_re_token) - def test_parser(self): + def test_parser(self) -> None: class Parser(textparser.Parser): - def keywords(self): + def keywords(self) -> set[str]: return set([ 'IF', 'A', 'B' ]) - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), @@ -764,17 +790,17 @@ def token_specs(self): ('MISMATCH', r'.') ] - def grammar(self): - return Sequence( + def grammar(self) -> Grammar: + return Grammar(Sequence( 'IF', Optional(choice('A', 'B')), 'ESCAPED_STRING', 'WORD', Optional(choice(DelimitedList('ESCAPED_STRING'), ZeroOrMore('NUMBER'))), - '.') + '.')) - datas = [ + datas: list[tuple[str, MatchObject, MatchObject]] = [ ( 'IF "foo" bar .', ['IF', [], '"foo"', 'bar', [[]], '.'], @@ -814,10 +840,10 @@ def grammar(self): tree = Parser().parse(text, token_tree=True) self.assertEqual(tree, expected_token_tree) - def test_parser_default_keywords(self): + def test_parser_default_keywords(self) -> None: class Parser(textparser.Parser): - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), @@ -827,17 +853,18 @@ def token_specs(self): ('MISMATCH', r'.') ] - def grammar(self): - return Sequence( + def grammar(self) -> Grammar: + return Grammar(Sequence( 'WORD', Optional('WORD'), 'ESCAPED_STRING', 'WORD', Optional(choice(DelimitedList('ESCAPED_STRING'), ZeroOrMore('NUMBER'))), - '.') + '.')) - datas = [ + # list of (input_string, expected_flat_match, expected_tree_match) tuples + datas: list[tuple[str, MatchObject, MatchObject]] = [ ( 'IF "foo" bar .', ['IF', [], '"foo"', 'bar', [[]], '.'], @@ -877,7 +904,7 @@ def grammar(self): tree = Parser().parse(text, token_tree=True) self.assertEqual(tree, expected_token_tree) - def test_parser_bare(self): + def test_parser_bare(self) -> None: class Parser(textparser.Parser): pass @@ -887,26 +914,26 @@ class Parser(textparser.Parser): self.assertEqual(str(cm.exception), 'No grammar defined.') - def test_parser_default_token_specs(self): + def test_parser_default_token_specs(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return 'WORD' + def grammar(self) -> Grammar: + return Grammar('WORD') tree = Parser().parse('foo') self.assertEqual(tree, 'foo') - def test_parser_tokenize_mismatch(self): + def test_parser_tokenize_mismatch(self) -> None: class Parser(textparser.Parser): - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), ('MISMATCH', r'.') ] - def grammar(self): + def grammar(self) -> Grammar: return Grammar('NUMBER') with self.assertRaises(textparser.ParseError) as cm: @@ -918,17 +945,17 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 2, column 3: "34>>!< None: class Parser(textparser.Parser): - def tokenize(self, _text): + def tokenize(self, _text: str) -> list[Token]: return tokenize([ ('NUMBER', '1.45', 0), ('NUMBER', '2', 5) ]) - def grammar(self): - return Sequence('NUMBER', 'WORD') + def grammar(self) -> Grammar: + return Grammar(Sequence('NUMBER', 'WORD')) with self.assertRaises(textparser.ParseError) as cm: Parser().parse('1.45 2') @@ -939,18 +966,18 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 1, column 6: "1.45 >>!<<2"') - def test_parser_grammar_mismatch_choice_max(self): + def test_parser_grammar_mismatch_choice_max(self) -> None: class Parser(textparser.Parser): - def __init__(self, tokens): + def __init__(self, tokens: TokenizeItems) -> None: self._tokens = tokens - def tokenize(self, _text): + def tokenize(self, _text: str) -> list[Token]: return tokenize(self._tokens, add_eof_token=False) - def grammar(self): - return Choice(Sequence('NUMBER', 'WORD'), - 'WORD') + def grammar(self) -> Grammar: + return Grammar(Choice(Sequence('NUMBER', 'WORD'), + 'WORD')) Data = namedtuple('Data', [ @@ -995,13 +1022,13 @@ def grammar(self): self.assertEqual(cm.exception.column, column) self.assertEqual(str(cm.exception), message) - def test_parse_error(self): + def test_parse_error(self) -> None: class Parser(textparser.Parser): - def tokenize(self, text): + def tokenize(self, text: str) -> list[Token]: raise TokenizeError(text, 5) - def grammar(self): + def grammar(self) -> Grammar: return Grammar(Sequence('NUMBER', 'WORD')) with self.assertRaises(textparser.ParseError) as cm: @@ -1014,7 +1041,7 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 2, column 3: "34>>!<<56"') - def test_markup_line(self): + def test_markup_line(self) -> None: datas = [ (0, '>>!<<0', None), (1, '0>>!<<', None), @@ -1037,7 +1064,7 @@ def test_markup_line(self): self.assertEqual(text, line) - def test_replace_blocks(self): + def test_replace_blocks(self) -> None: datas = [ ('{}', '{}'), ('{{}}', '{ }'), @@ -1049,7 +1076,7 @@ def test_replace_blocks(self): new = replace_blocks(old) self.assertEqual(new, expected) - def test_replace_blocks_start_end(self): + def test_replace_blocks_start_end(self) -> None: datas = [ ('1[a]2[b]3', '1[ ]2[ ]3', '[', ']'), ('1{a}2{b}3', '1{ }2{ }3', '{', '}'), @@ -1061,13 +1088,13 @@ def test_replace_blocks_start_end(self): new = replace_blocks(old, start, end) self.assertEqual(new, expected) - def test_any_zero_or_more(self): + def test_any_zero_or_more(self) -> None: class Parser(textparser.Parser): - def keywords(self): - return ['interesting_group'] + def keywords(self) -> set[str]: + return set(['interesting_group']) - def token_specs(self): + def token_specs(self) -> list[tuple[str,str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('WORD', r'[A-Za-z0-9_]+'), @@ -1077,16 +1104,16 @@ def token_specs(self): ('EQUAL', '=', r'='), ] - def grammar(self): + def grammar(self) -> Grammar: interesting_group = textparser.Sequence( 'interesting_group', '{', ZeroOrMore(Sequence('WORD', '=', 'WORD', ';')), '}', ';') - return Sequence(AnyUntil('interesting_group'), - interesting_group, - ZeroOrMore(Any())) + return Grammar(Sequence(AnyUntil('interesting_group'), + interesting_group, + ZeroOrMore(Any()))) text = ''' @@ -1105,6 +1132,7 @@ def grammar(self): ''' tree = Parser().parse(text) + assert isinstance(tree, list) self.assertEqual(tree[1], [ 'interesting_group', @@ -1116,11 +1144,11 @@ def grammar(self): '}', ';']) - def test_error_picklable(self): + def test_error_picklable(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__EOF__')) try: Parser().parse('123', match_sof=True) diff --git a/textparser.py b/textparser.py index 8d76d72..ead0d58 100644 --- a/textparser.py +++ b/textparser.py @@ -1,9 +1,11 @@ # A text parser. import re -from collections import namedtuple -from operator import itemgetter +import collections.abc +import typing +from dataclasses import dataclass +from operator import itemgetter __author__ = 'Erik Moqvist' __version__ = '0.24.0' @@ -18,40 +20,30 @@ class _Mismatch(object): """ - -class _String(object): - """Matches a specific token kind. - - """ - - def __init__(self, kind): - self.kind = kind - - def match(self, tokens): - if self.kind == tokens.peek().kind: - return tokens.get_value() - else: - return MISMATCH - +@dataclass(slots=True) +class Token: + kind: str + value: str|None + offset: int class _Tokens(object): - def __init__(self, tokens): + def __init__(self, tokens: list[Token]): self._tokens = tokens self._pos = 0 self._max_pos = -1 - self._stack = [] + self._stack: list[int] = [] - def get_value(self): + def get_value(self) -> Token|str: pos = self._pos self._pos += 1 return self._tokens[pos] - def peek(self): + def peek(self) -> Token: return self._tokens[self._pos] - def peek_max(self): + def peek_max(self) -> Token: pos = self._pos if self._max_pos > pos: @@ -62,55 +54,82 @@ def peek_max(self): else: return self._tokens[pos] - def save(self): + def save(self) -> None: self._stack.append(self._pos) - def restore(self): + def restore(self) -> None: self._pos = self._stack.pop() - def update(self): + def update(self) -> None: self._stack[-1] = self._pos - def mark_max_restore(self): + def mark_max_restore(self) -> None: if self._pos > self._max_pos: self._max_pos = self._pos self._pos = self._stack.pop() - def mark_max_load(self): + def mark_max_load(self) -> None: if self._pos > self._max_pos: self._max_pos = self._pos self._pos = self._stack[-1] - def drop(self): + def drop(self) -> None: self._stack.pop() - def __repr__(self): + def __repr__(self) -> str: return str(self._tokens[self._pos:self._pos + 2]) +MatchObject = list["MatchObject"]|dict[str, list["MatchObject"]]|tuple[str,"MatchObject"]|Token|str + +class Pattern(object): + """Base class of all patterns. + + """ + + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else + on match. + + """ + + raise NotImplementedError('To be implemented by subclasses.') + +class _String(Pattern): + """Matches a specific token kind. + + """ + + def __init__(self, kind: str) -> None: + self.kind = kind + + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + if self.kind == tokens.peek().kind: + return tokens.get_value() + else: + return MISMATCH class _StringTokens(_Tokens): - def get_value(self): + def get_value(self) -> Token|str: pos = self._pos self._pos += 1 - return self._tokens[pos].value + return typing.cast(str, self._tokens[pos].value) -def _wrap_string(item): +def _wrap_string(item: Pattern|str) -> Pattern: if isinstance(item, str): item = _String(item) return item - -def _wrap_strings(items): +def _wrap_strings(items: collections.abc.Sequence[Pattern|str]) -> list[Pattern]: return [_wrap_string(item) for item in items] -def _format_invalid_syntax(text, offset): +def _format_invalid_syntax(text: str, offset: int) -> str: return 'Invalid syntax at line {}, column {}: "{}"'.format( line(text, offset), column(text, offset), @@ -131,14 +150,14 @@ class TokenizeError(Error): """ - def __init__(self, text, offset): + def __init__(self, text: str, offset: int) -> None: self._text = text self._offset = offset message = _format_invalid_syntax(text, offset) super(TokenizeError, self).__init__(message) @property - def text(self): + def text(self) -> str: """The input text to the tokenizer. """ @@ -146,7 +165,7 @@ def text(self): return self._text @property - def offset(self): + def offset(self) -> int: """Offset into the text where the tokenizer failed. """ @@ -160,13 +179,13 @@ class GrammarError(Error): """ - def __init__(self, offset): + def __init__(self, offset: int) -> None: self._offset = offset message = 'Invalid syntax at offset {}.'.format(offset) super(GrammarError, self).__init__(message) @property - def offset(self): + def offset(self) -> int: """Offset into the text where the parser failed. """ @@ -179,7 +198,7 @@ class ParseError(Error): """ - def __init__(self, text, offset): + def __init__(self, text: str, offset: int): self._text = text self._offset = offset self._line = line(text, offset) @@ -188,7 +207,7 @@ def __init__(self, text, offset): super(ParseError, self).__init__(message) @property - def text(self): + def text(self) -> str: """The input text to the parser. """ @@ -196,7 +215,7 @@ def text(self): return self._text @property - def offset(self): + def offset(self) -> int: """Offset into the text where the parser failed. """ @@ -204,7 +223,7 @@ def offset(self): return self._offset @property - def line(self): + def line(self) -> int: """Line where the parser failed. """ @@ -212,50 +231,32 @@ def line(self): return self._line @property - def column(self): + def column(self) -> int: """Column where the parser failed. """ return self._column - def __reduce__(self): + def __reduce__(self) -> tuple[typing.Any, ...]: """Adds pickling support.""" return type(self), (self._text, self._offset), {} - -Token = namedtuple('Token', ['kind', 'value', 'offset']) - - -class Pattern(object): - """Base class of all patterns. - - """ - - def match(self, tokens): - """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else - on match. - - """ - - raise NotImplementedError('To be implemented by subclasses.') - - class Sequence(Pattern): """Matches a sequence of patterns. Becomes a list in the parse tree. """ - def __init__(self, *patterns): + def __init__(self, *patterns: Pattern|str) -> None: self.patterns = _wrap_strings(patterns) - def match(self, tokens): - matched = [] + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + matched: list[MatchObject] = [] for pattern in self.patterns: mo = pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): return MISMATCH matched.append(mo) @@ -269,17 +270,17 @@ class Choice(Pattern): """ - def __init__(self, *patterns): + def __init__(self, *patterns: Pattern|str) -> None: self._patterns = _wrap_strings(patterns) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: tokens.save() for pattern in self._patterns: tokens.mark_max_load() mo = pattern.match(tokens) - if mo is not MISMATCH: + if not isinstance(mo, _Mismatch): tokens.drop() return mo @@ -288,6 +289,55 @@ def match(self, tokens): return MISMATCH +class Tag(Pattern): + """Tags any matched `pattern` with name `name`. Becomes a two-tuple of + `name` and match in the parse tree. + + """ + + def __init__(self, name: str, pattern: Pattern|str) -> None: + self._name = name + self._pattern = _wrap_string(pattern) + + @property + def pattern(self) -> Pattern: + return self._pattern + + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + mo = self._pattern.match(tokens) + + if not isinstance(mo, _Mismatch): + return (self._name, mo) + else: + return MISMATCH + + +class Forward(Pattern): + """Forward declaration of a pattern. + + .. code-block:: python + + >>> foo = Forward() + >>> foo <<= Sequence('NUMBER') + + """ + + def __init__(self) -> None: + self._pattern: Pattern|None = None + + @property + def pattern(self) -> Pattern|None: + return self._pattern + + def __ilshift__(self, other: Pattern|str) -> "Forward": + self._pattern = _wrap_string(other) + + return self + + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + if self._pattern is not None: + return self._pattern.match(tokens) + return MISMATCH class ChoiceDict(Pattern): """Matches any of given patterns. The first token kind of all patterns @@ -299,23 +349,26 @@ class ChoiceDict(Pattern): """ - def __init__(self, *patterns): - self._patterns_map = {} - patterns = _wrap_strings(patterns) + def __init__(self, *patterns: Pattern|str) -> None: + self._patterns_map: dict[str, Pattern] = {} + wrapped_patterns = _wrap_strings(patterns) - for pattern in patterns: + for pattern in wrapped_patterns: self._check_pattern(pattern, pattern) @property - def patterns_map(self): + def patterns_map(self) -> dict[str, Pattern]: return self._patterns_map - def _check_pattern(self, inner, outer): + def _check_pattern(self, inner: Pattern, outer: Pattern) -> None: if isinstance(inner, _String): self._add_pattern(inner.kind, outer) elif isinstance(inner, Sequence): self._check_pattern(inner.patterns[0], outer) elif isinstance(inner, (Tag, Forward)): + if inner.pattern is None: + raise Error( + 'No inner pattern defined for {}.'.format(type(inner))) self._check_pattern(inner.pattern, outer) elif isinstance(inner, ChoiceDict): for pattern in inner.patterns_map.values(): @@ -324,7 +377,7 @@ def _check_pattern(self, inner, outer): raise Error( 'Unsupported pattern type {}.'.format(type(inner))) - def _add_pattern(self, kind, pattern): + def _add_pattern(self, kind: str, pattern: Pattern) -> None: if kind in self._patterns_map: raise Error( "First token kind must be unique, but {} isn't.".format( @@ -332,7 +385,7 @@ def _add_pattern(self, kind, pattern): self._patterns_map[kind] = pattern - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: kind = tokens.peek().kind if kind in self._patterns_map: @@ -347,18 +400,18 @@ class Repeated(Pattern): """ - def __init__(self, pattern, minimum=0): + def __init__(self, pattern: Pattern|str, minimum: int=0) -> None: self._pattern = _wrap_string(pattern) self._minimum = minimum - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: matched = [] tokens.save() while True: mo = self._pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): tokens.mark_max_restore() break @@ -381,22 +434,22 @@ class RepeatedDict(Repeated): """ - def __init__(self, pattern, minimum=0, key=None): + def __init__(self, pattern: Pattern|str, minimum: int=0, key: typing.Callable[[MatchObject], str]|None=None) -> None: super(RepeatedDict, self).__init__(pattern, minimum) if key is None: - key = itemgetter(0) + key = typing.cast(typing.Callable[[MatchObject], str], itemgetter(0)) self._key = key - def match(self, tokens): - matched = {} + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + matched: dict[str, list[MatchObject]] = {} tokens.save() while True: mo = self._pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): tokens.mark_max_restore() break @@ -422,7 +475,7 @@ class ZeroOrMore(Repeated): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: super(ZeroOrMore, self).__init__(pattern, 0) @@ -433,7 +486,7 @@ class ZeroOrMoreDict(RepeatedDict): """ - def __init__(self, pattern, key=None): + def __init__(self, pattern: Pattern|str, key: typing.Callable[[MatchObject], str]|None=None) -> None: super(ZeroOrMoreDict, self).__init__(pattern, 0, key) @@ -444,7 +497,7 @@ class OneOrMore(Repeated): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: super(OneOrMore, self).__init__(pattern, 1) @@ -455,7 +508,7 @@ class OneOrMoreDict(RepeatedDict): """ - def __init__(self, pattern, key=None): + def __init__(self, pattern: Pattern|str, key: typing.Callable[[MatchObject], str]|None=None) -> None: super(OneOrMoreDict, self).__init__(pattern, 1, key) @@ -466,15 +519,15 @@ class DelimitedList(Pattern): """ - def __init__(self, pattern, delim=','): + def __init__(self, pattern: Pattern|str, delim: str=',') -> None: self._pattern = _wrap_string(pattern) self._delim = _wrap_string(delim) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: # First pattern. mo = self._pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): return MISMATCH matched = [mo] @@ -484,13 +537,13 @@ def match(self, tokens): # Discard the delimiter. mo = self._delim.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): break # Pattern. mo = self._pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): break matched.append(mo) @@ -507,14 +560,14 @@ class Optional(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: tokens.save() mo = self._pattern.match(tokens) - if mo is MISMATCH: + if isinstance(mo, _Mismatch): tokens.mark_max_restore() return [] @@ -529,7 +582,7 @@ class Any(Pattern): """ - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: if tokens.peek().kind == '__EOF__': return MISMATCH else: @@ -542,17 +595,17 @@ class AnyUntil(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): - matched = [] + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: + matched: list[MatchObject] = [] while True: tokens.save() mo = self._pattern.match(tokens) - if mo is not MISMATCH: + if not isinstance(mo, _Mismatch): break tokens.restore() @@ -569,15 +622,15 @@ class And(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: tokens.save() mo = self._pattern.match(tokens) tokens.restore() - if mo is MISMATCH: + if isinstance(mo, _Mismatch): return MISMATCH else: return [] @@ -591,15 +644,15 @@ class Not(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern|str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: tokens.save() mo = self._pattern.match(tokens) tokens.restore() - if mo is MISMATCH: + if isinstance(mo, _Mismatch): return [] else: return MISMATCH @@ -610,85 +663,36 @@ class NoMatch(Pattern): """ - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|_Mismatch: return MISMATCH - -class Tag(Pattern): - """Tags any matched `pattern` with name `name`. Becomes a two-tuple of - `name` and match in the parse tree. - - """ - - def __init__(self, name, pattern): - self._name = name - self._pattern = _wrap_string(pattern) - - @property - def pattern(self): - return self._pattern - - def match(self, tokens): - mo = self._pattern.match(tokens) - - if mo is not MISMATCH: - return (self._name, mo) - else: - return MISMATCH - - -class Forward(Pattern): - """Forward declaration of a pattern. - - .. code-block:: python - - >>> foo = Forward() - >>> foo <<= Sequence('NUMBER') - - """ - - def __init__(self): - self._pattern = None - - @property - def pattern(self): - return self._pattern - - def __ilshift__(self, other): - self._pattern = _wrap_string(other) - - return self - - def match(self, tokens): - return self._pattern.match(tokens) - - class Grammar(object): """Creates a tree of given tokens using the grammar `grammar`. """ - def __init__(self, grammar): + def __init__(self, grammar: Pattern|str) -> None: + self._root: Pattern if isinstance(grammar, str): - grammar = _wrap_string(grammar) - - self._root = grammar + self._root = _wrap_string(grammar) + else: + self._root = grammar - def parse(self, tokens, token_tree=False): + def parse(self, token_list: list[Token], token_tree: bool=False) -> MatchObject: if token_tree: - tokens = _Tokens(tokens) + tokens = _Tokens(token_list) else: - tokens = _StringTokens(tokens) + tokens = _StringTokens(token_list) parsed = self._root.match(tokens) - if parsed is not MISMATCH and tokens.peek_max().kind == '__EOF__': + if not isinstance(parsed, _Mismatch) and tokens.peek_max().kind == '__EOF__': return parsed else: raise GrammarError(tokens.peek_max().offset) -def choice(*patterns): +def choice(*patterns: Pattern|str) -> Choice|ChoiceDict: """Returns an instance of the fastest choice class for given patterns `patterns`. It is recommended to use this function instead of instantiate :class:`~textparser.Choice` or @@ -702,7 +706,7 @@ def choice(*patterns): return Choice(*patterns) -def markup_line(text, offset, marker='>>!<<'): +def markup_line(text: str, offset: int, marker: str='>>!<<') -> str: """Insert `marker` at `offset` into `text`, and return the marked line. @@ -724,17 +728,17 @@ def markup_line(text, offset, marker='>>!<<'): return text[begin:offset] + marker + text[offset:end] -def line(text, offset): +def line(text: str, offset: int) -> int: return text[:offset].count('\n') + 1 -def column(text, offset): +def column(text: str, offset: int) -> int: line_start = text.rfind('\n', 0, offset) return offset - line_start -def tokenize_init(spec): +def tokenize_init(spec: collections.abc.Sequence[tuple[str, str]|tuple[str, str, int]]) -> tuple[list[Token], str]: """Initialize a tokenizer. Should only be called by the :func:`~textparser.Parser.tokenize` method in the parser. @@ -742,7 +746,7 @@ def tokenize_init(spec): tokens = [Token('__SOF__', '__SOF__', 0)] re_token = '|'.join([ - '(?P<{}>{})'.format(name, regex) for name, regex in spec + '(?P<{}>{})'.format(token_spec[0], token_spec[1]) for token_spec in spec ]) return tokens, re_token @@ -768,7 +772,8 @@ class Parser(object): """ - def _unpack_token_specs(self): + def _unpack_token_specs(self) -> tuple[dict[str, str], + list[tuple[str,str]]]: names = {} specs = [] @@ -781,7 +786,7 @@ def _unpack_token_specs(self): return names, specs - def keywords(self): + def keywords(self) -> set[str]: """A set of keywords in the text. .. code-block:: python @@ -793,7 +798,7 @@ def keywords(self): return set() - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str, str, str]]: """The token specifications with token name, regular expression, and optionally a user friendly name. @@ -813,7 +818,7 @@ def token_specs(self): ('MISMATCH', r'.') ] - def tokenize(self, text): + def tokenize(self, text: str) -> list[Token]: """Tokenize given string `text`, and return a list of tokens. Raises :class:`~textparser.TokenizeError` on failure. @@ -830,6 +835,7 @@ def tokenize(self, text): for mo in re.finditer(re_token, text, re.DOTALL): kind = mo.lastgroup + assert isinstance(kind, str) if kind == 'SKIP': pass @@ -848,7 +854,7 @@ def tokenize(self, text): return tokens - def grammar(self): + def grammar(self) -> Grammar: """The text grammar is used to create a parse tree out of a list of tokens. @@ -858,7 +864,7 @@ def grammar(self): raise NotImplementedError('No grammar defined.') - def parse(self, text, token_tree=False, match_sof=False): + def parse(self, text: str, token_tree: bool=False, match_sof:bool=False) -> _Mismatch|MatchObject: """Parse given string `text` and return the parse tree. Raises :class:`~textparser.ParseError` on failure. @@ -888,12 +894,19 @@ def parse(self, text, token_tree=False, match_sof=False): if len(tokens) > 0 and tokens[0].kind == '__SOF__': del tokens[0] - return Grammar(self.grammar()).parse(tokens, token_tree) + grammar = self.grammar() + if isinstance(grammar, Grammar): + return grammar.parse(tokens, token_tree) + else: + # used for compatibility with old user code from the + # pre-type hints era... + return Grammar(grammar).parse(tokens, token_tree) + except (TokenizeError, GrammarError) as e: raise ParseError(text, e.offset) -def replace_blocks(string, start='{', end='}'): +def replace_blocks(string: str, start: str='{', end: str='}') -> str: """Replace all blocks starting with `start` and ending with `end` with spaces (not including `start` and `end`).