Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
797f157
Initial: add flag types and regexing methods. Global flags still not …
YehudaKLEIN Apr 1, 2023
507971f
Unite all flags to single class, where unset is a parameter (name). S…
YehudaKLEIN Apr 4, 2023
83e573e
add asm tests (and fix a mistake they showed)
YehudaKLEIN Apr 4, 2023
30f0342
add more asm tests
YehudaKLEIN Apr 4, 2023
58594eb
add validation on input to flag
YehudaKLEIN Apr 4, 2023
6c2e804
add validation tests
YehudaKLEIN Apr 4, 2023
d78992c
add validation to insure validity of locale_dependent and unicode aga…
YehudaKLEIN Apr 4, 2023
ce46ec1
Add api tests. Fix asm tests. Fix some logic that was broken.
YehudaKLEIN Apr 5, 2023
43c3443
Reformat comment
YehudaKLEIN Apr 5, 2023
fb2ff5b
add test
YehudaKLEIN Apr 7, 2023
9236c84
Initial: add flag types and regexing methods. Global flags still not …
YehudaKLEIN Apr 1, 2023
92d9c6e
Unite all flags to single class, where unset is a parameter (name). S…
YehudaKLEIN Apr 4, 2023
fa5858c
add asm tests (and fix a mistake they showed)
YehudaKLEIN Apr 4, 2023
12f24ab
add more asm tests
YehudaKLEIN Apr 4, 2023
e44b940
add validation on input to flag
YehudaKLEIN Apr 4, 2023
a690981
add validation tests
YehudaKLEIN Apr 4, 2023
b326621
add validation to insure validity of locale_dependent and unicode aga…
YehudaKLEIN Apr 4, 2023
0534cd3
Add api tests. Fix asm tests. Fix some logic that was broken.
YehudaKLEIN Apr 5, 2023
48a529f
Reformat comment
YehudaKLEIN Apr 5, 2023
38dee25
add test
YehudaKLEIN Apr 7, 2023
09af9a5
Merge remote-tracking branch 'origin/feature/add_inline_flags' into f…
YehudaKLEIN Apr 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ke/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
RegexFlag,
)

from ke import asm

try:
# added in Python 3.11
from re import NOFLAG # type: ignore
Expand Down Expand Up @@ -54,7 +56,9 @@
def re(pattern: AnyStr, flavor: Optional[Flavor] = None) -> AnyStr:
# TODO: LRU cache
if _is_bytes_like(pattern):
asm.InlineFlag.PATTERN_IS_BYTES_LIKE = True
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be better placed in some global values-per-compilation dictionary somewhere.

return _re(pattern.decode("ascii"), flavor).encode("ascii") # type: ignore
asm.InlineFlag.PATTERN_IS_BYTES_LIKE = False
assert isinstance(pattern, str)
return _re(pattern, flavor)

Expand Down
117 changes: 117 additions & 0 deletions ke/asm.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,123 @@ class NegativeLookbehind(ParensSyntax):
Lookbehind.INVERTED = NegativeLookbehind


class InlineFlag(namedtuple("InlineFlag", ["flag_character", "name", "sub"]), Asm):
A = "a"
L = "L"
U = "u"
I = "i"
M = "m"
S = "s"
regex_to_kleenexp = {
A: "ascii_only",
L: "locale_dependent",
U: "unicode",
I: "ignore_case",
M: "multiline",
S: "any_matches_all",
}
UNSET = "unset"
UNSETTABLE = set([I, M, S])
INCOMPATIBLE = set([A, L, U])
# Unicode flag CANNOT be used in bytes-like,
# locale_dependent can ONLY be used in bytes-like.
# This value is being set before compilation start.
PATTERN_IS_BYTES_LIKE = False

def to_regex(self, flavor, capture_names, wrap=False):
# Here we break the tree structure, using this specialized
# function to create a flagging expression for the rest of
# the subtree. The only flag in a flagging sequence that
# reaches the to_regex() method is the first one in the
# sequence (there may be other sequences). This is necessary
# because the parenthesis wrapping and regex legality are
# dependent on the whole flagging expression.
Copy link
Author

@Yehuda-blip Yehuda-blip Apr 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we drop all the validations, there is always the option to simply compile [ignore_case multiline 'string'] to
(?i:(?m:string)) instead of (?im:string), which should make the whole thing a lot simpler (and I very much doubt it would have much effect on the performance of the result, if that is even a concern).

return self.flag_sub_expression(flavor, capture_names, wrap)

def flag_sub_expression(self, flavor, capture_names, wrap):
    """Render this flag and its directly nested flags as one inline-flag group.

    Follows ``sub`` for as long as it is an ``InlineFlag``, runs every
    ``validate_*`` check on each collected flag, and returns
    ``(?<set>-<unset>:<body>)`` where ``<body>`` is the regex produced by
    the first non-flag descendant.

    Any exception raised by the ``validate_*`` helpers propagates to the
    caller.
    """
    # Keyed by flag character so that a later declaration of the same flag
    # replaces an earlier one: 'ignore_case ignore_case:unset' gives
    # '(?-i:)' rather than '(?i-i:)' (this behaviour might be replaced
    # with an error in the future).
    latest_by_character = {self.flag_character: self}
    node = self.sub
    while isinstance(node, InlineFlag):
        latest_by_character[node.flag_character] = node
        node = node.sub

    enabled = []
    disabled = []
    seen_incompatible = False
    for flag in list(latest_by_character.values()):
        self.validate_parameter_is_valid(flag)
        self.validate_unset_flag_is_unsettable(flag)
        self.validate_bytes_like_against_unicode_or_locale_only(flag)
        seen_incompatible = self.validate_no_incompatibles_together(
            flag, seen_incompatible
        )
        bucket = disabled if flag.name == InlineFlag.UNSET else enabled
        bucket.append(flag.flag_character)

    flag_spec = "".join(enabled)
    if disabled:
        # The '-' separator only appears when at least one flag is unset.
        flag_spec += "-" + "".join(disabled)
    body = node.to_regex(flavor, capture_names, wrap)
    return f"(?{flag_spec}:{body})"

def validate_parameter_is_valid(self, flag):
    """Reject a flag whose parameter is neither ``unset`` nor absent.

    Args:
        flag: the ``InlineFlag`` being validated; it is not necessarily
            ``self``, since the head of a flag sequence validates every
            flag in that sequence.

    Raises:
        CompileError: if ``flag.name`` is not ``InlineFlag.UNSET`` or ``None``.
    """
    if flag.name in (InlineFlag.UNSET, None):
        return
    # Describe the flag under validation, not the head of the sequence,
    # so the message names the flag that actually carries the bad parameter.
    raise CompileError(
        f"Unrecognized token passed to flag: "
        f"{InlineFlag.regex_to_kleenexp[flag.flag_character]} does not accept {flag.name}"
    )

def validate_unset_flag_is_unsettable(self, flag):
    """Reject ``unset`` on a flag that is not in ``InlineFlag.UNSETTABLE``.

    Args:
        flag: the ``InlineFlag`` being validated; it is not necessarily
            ``self``, since the head of a flag sequence validates every
            flag in that sequence.

    Raises:
        CompileError: if ``flag.name`` is ``InlineFlag.UNSET`` and
            ``flag.flag_character`` is not in ``InlineFlag.UNSETTABLE``.
    """
    if flag.name != InlineFlag.UNSET:
        return
    if flag.flag_character in InlineFlag.UNSETTABLE:
        return
    # Name the flag under validation, not the head of the sequence.
    raise CompileError(
        f"Unsetting not supported for this flag: "
        f"{InlineFlag.regex_to_kleenexp[flag.flag_character]}"
    )

def validate_bytes_like_against_unicode_or_locale_only(self, flag):
    """Check ``flag`` against the kind of pattern currently being compiled.

    Reads the class-level ``InlineFlag.PATTERN_IS_BYTES_LIKE`` switch.

    Raises:
        CompileError: for the ``U`` flag when the switch is set, or for
            the ``L`` flag when it is not.
    """
    character = flag.flag_character
    if InlineFlag.PATTERN_IS_BYTES_LIKE:
        if character == InlineFlag.U:
            raise CompileError(
                f"Cannot use {InlineFlag.regex_to_kleenexp[flag.flag_character]} flag "
                f"with a bytes pattern"
            )
    elif character == InlineFlag.L:
        raise CompileError(
            f"Can only use {InlineFlag.regex_to_kleenexp[flag.flag_character]} flag "
            f"with a bytes pattern"
        )

def validate_no_incompatibles_together(self, flag, got_one_incompatible):
    """
    Track whether a flag from ``InlineFlag.INCOMPATIBLE`` has been seen.

    Args:
        flag: the ``InlineFlag`` being validated.
        got_one_incompatible: running state from the previous call; truthy
            once an incompatible flag has already been seen.

    Returns:
        True if ``flag`` is the first incompatible flag; otherwise the
        incoming state as a bool, so a flag outside ``INCOMPATIBLE`` never
        resets what was seen earlier in the sequence.

    Raises:
        CompileError: if ``flag`` is incompatible and one was already seen.
    """
    if flag.flag_character not in InlineFlag.INCOMPATIBLE:
        # Carry the running state forward. Returning False here would let
        # an unrelated flag between two incompatible ones hide the conflict.
        return bool(got_one_incompatible)
    if got_one_incompatible:
        raise CompileError(
            f"{InlineFlag.regex_to_kleenexp[InlineFlag.A]}, "
            f"{InlineFlag.regex_to_kleenexp[InlineFlag.L]} and "
            f"{InlineFlag.regex_to_kleenexp[InlineFlag.U]} are "
            f"incompatible and cannot be used in the same flag "
            f"setting expression"
        )
    return True

def is_empty(self):
    """Delegate to the wrapped sub-expression's ``is_empty``."""
    wrapped = self.sub
    return wrapped.is_empty()


class Setting(namedtuple("Setting", ["setting", "sub"]), Asm):
def to_regex(self, flavor, capture_names, wrap=False):
if not self.setting:
Expand Down
12 changes: 12 additions & 0 deletions ke/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,16 @@ def invert_operator(n, expr):
long, short = names.split()
builtin_operators[short] = builtin_operators[long]

# Maps each kleenexp flag name to its regex flag character, i.e. the inverse
# of asm.InlineFlag.regex_to_kleenexp, in the same order.
inline_flags = {
    kleenexp_name: flag_character
    for flag_character, kleenexp_name in asm.InlineFlag.regex_to_kleenexp.items()
}
# Register the flag names alongside the other built-in operator names.
builtin_operators.update(inline_flags)


def compile(ast):
macros = dict(builtin_macros)
Expand Down Expand Up @@ -257,6 +267,8 @@ def compile_operator(o, macros):
max = int(max)

return asm.Multiple(min, max, get_greediness_by_name_of_operator(o.name), sub)
if o.op_name in inline_flags:
return asm.InlineFlag(inline_flags[o.op_name], o.name, sub)
if o.op_name not in builtin_operators:
raise CompileError("Operator %s does not exist" % o.op_name)
return builtin_operators[o.op_name](o.name, sub)
Expand Down
37 changes: 37 additions & 0 deletions ke/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,3 +674,40 @@ def test_no_whitespace():
) == ke.re(
"Hello. My name is [capture:name #tmp ' ' #tmp #tmp=[#uppercase [1+ #lowercase]]]. You killed my ['Father' | 'Mother' | 'Son' | 'Daughter' | 'Dog' | 'Hamster']. Prepare to die."
)


def test_inline_flags():
    """Exercise inline flags end to end through the public ``ke`` API."""
    # ignore_case, and unsetting it again in a nested group
    assert ke.match("[ignore_case 'A']", "a")
    assert not ke.match("[ignore_case [ignore_case:unset 'A']]", "a")

    # multiline, with the unset placed after or before the line anchor
    two_lines = "a\na"
    assert ke.findall("[multiline #start_line 'a']", two_lines) == ["a", "a"]
    # this is weird, but corresponds to regex
    assert ke.findall(
        "[multiline #start_line [multiline:unset 'a']]", two_lines
    ) == ["a", "a"]
    assert ke.findall(
        "[multiline [multiline:unset #start_line 'a']]", two_lines
    ) == ["a"]

    assert ke.match("[any_matches_all #any]", "\n")

    # U+0660 is a non-ASCII digit
    non_ascii_digit = "\u0660"
    assert ke.match("[#digit]", non_ascii_digit)
    assert not ke.match("[ascii_only #digit]", non_ascii_digit)

    assert ke.re("[unicode 'test']") == r"(?u:test)"
    assert ke.re(b"[locale_dependent 'test']") == b"(?L:test)"

    # Each of these must be rejected at compile time.
    rejected_patterns = (
        "[locale_dependent 'test']",
        b"[locale_dependent ascii_only 'test']",
        b"[unicode 'test']",
        "[unicode ascii_only 'test']",
    )
    for pattern in rejected_patterns:
        with pytest.raises(re.error):
            ke.re(pattern)

    # The flat and the nested spelling of the same three flags must agree.
    three_lines = "A\nA\nA"
    combined_spellings = (
        "[ignore_case multiline any_matches_all #start_line 'a']",
        "[ignore_case [multiline [any_matches_all #start_line 'a']]]",
    )
    for pattern in combined_spellings:
        assert ke.findall(pattern, three_lines) == ["A", "A", "A"]

    scoped_matches = ke.findall(
        "AAA[ignore_case [any_matches_all 'a'#any]'a']AA", "AAAA\naAAAAAA\nAAAaAAA\nAAA"
    )
    assert scoped_matches == [
        "AAAA\naAA",
        "AAAA\nAAA",
    ]
111 changes: 111 additions & 0 deletions ke/tests/test_asm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
START_LINE,
START_STRING,
WORD_BOUNDARY,
InlineFlag,
)
from ke._errors import CompileError

Expand Down Expand Up @@ -125,3 +126,113 @@ def test_boundary():
assert assemble(START_LINE) == r"^"
assert assemble(START_STRING) == r"\A"
assert assemble(WORD_BOUNDARY) == r"\b"


def test_inline_flags():
save_PATTERN_IS_BYTES_LIKE = InlineFlag.PATTERN_IS_BYTES_LIKE
assert assemble(InlineFlag("a", None, Literal("test"))) == r"(?a:test)"
InlineFlag.PATTERN_IS_BYTES_LIKE = True
assert assemble(InlineFlag("L", None, Literal("test"))) == r"(?L:test)"
InlineFlag.PATTERN_IS_BYTES_LIKE = False
assert assemble(InlineFlag("u", None, Literal("test"))) == r"(?u:test)"
assert assemble(InlineFlag("i", None, Literal("test"))) == r"(?i:test)"
assert assemble(InlineFlag("m", None, Literal("test"))) == r"(?m:test)"
assert assemble(InlineFlag("s", None, Literal("test"))) == r"(?s:test)"
assert assemble(InlineFlag("i", "unset", Literal("test"))) == r"(?-i:test)"
assert assemble(InlineFlag("m", "unset", Literal("test"))) == r"(?-m:test)"
assert assemble(InlineFlag("s", "unset", Literal("test"))) == r"(?-s:test)"
assert (
assemble(InlineFlag("s", None, InlineFlag("s", None, Literal("test"))))
== r"(?s:test)"
)
assert (
assemble(InlineFlag("s", "unset", InlineFlag("s", "unset", Literal("test"))))
== r"(?-s:test)"
)
with pytest.raises(CompileError):
assemble(InlineFlag("a", "unset", Literal("test")))
InlineFlag.PATTERN_IS_BYTES_LIKE = True
with pytest.raises(CompileError):
assemble(InlineFlag("L", "unset", Literal("test")))
InlineFlag.PATTERN_IS_BYTES_LIKE = False
with pytest.raises(CompileError):
assemble(InlineFlag("u", "unset", Literal("test")))
with pytest.raises(CompileError):
assemble(InlineFlag("i", "garbage-nothing", Literal("test")))
with pytest.raises(CompileError):
assemble(InlineFlag("i", "", Literal("test")))
with pytest.raises(CompileError):
assemble(InlineFlag("a", None, InlineFlag("L", None, Literal("test"))))
with pytest.raises(CompileError):
assemble(InlineFlag("L", None, InlineFlag("u", None, Literal("test"))))
with pytest.raises(CompileError):
assemble(InlineFlag("u", None, InlineFlag("a", None, Literal("test"))))
InlineFlag.PATTERN_IS_BYTES_LIKE = True
assert (
assemble(InlineFlag("L", None, InlineFlag("L", None, Literal("test"))))
== r"(?L:test)"
)

InlineFlag.PATTERN_IS_BYTES_LIKE = False
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pre-commit did not like my formatting of these tests and turned them into these really line-heavy trees. Originally each call was on its own line.

assert (
assemble(
InlineFlag(
"u",
None,
InlineFlag(
"i",
None,
InlineFlag(
"m",
None,
InlineFlag(
"s",
None,
InlineFlag(
"i",
"unset",
InlineFlag(
"m",
"unset",
InlineFlag("s", "unset", Literal("test")),
),
),
),
),
),
),
)
== r"(?u-ims:test)"
)

assert (
assemble(
InlineFlag(
"i",
"unset",
InlineFlag(
"a",
None,
InlineFlag(
"m",
"unset",
InlineFlag(
"m",
None,
InlineFlag(
"i",
None,
InlineFlag(
"s",
None,
InlineFlag("s", "unset", Literal("test")),
),
),
),
),
),
),
)
== r"(?iam-s:test)"
)
InlineFlag.PATTERN_IS_BYTES_LIKE = save_PATTERN_IS_BYTES_LIKE