diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index f3aaeb9..057dc9c 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -6,7 +6,6 @@ and Unicode/ASCII. """ -import os import os.path as osp import re diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 48e0cd8..db3daea 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -51,20 +51,25 @@ # control-sequence is both allowed in text-mode and math-mode, # then the same control sequence also appears in amslatex. # -# operator-name: If present, this symbol is a Mathics3 operator with -# whose class name is the given name. For example, the -# Divide operator, "/" is here. But some symbols like -# "`" or "." inside a number could be considered -# either an operator or as part of a lexical symbol. -# "operators.yml" may contain use of these symbols, while here we -# might not (or we might). Also, from an operator +# operator-name: If present, this symbol is part of some +# Mathics3 operator whose class name is the +# given name. For example, the Divide operator, "/" +# is here. But some symbols like "`" or "." inside a +# number could be considered either an operator or as +# part of a lexical symbol. "operators.yml" may +# contain use of these symbols, while here we might +# not (or we might). Also, from an operator # perspective, an operator name like "Association" # might have *two* (bracketing) symbols associated # here: "LeftAssociation" and "RightAssociation". -# More operator information can be found in -# file "operators.yml". +# More operator information can be found in file +# "operators.yml". Note that some operators, like +# Integrate, have several symbols, e.g. Integral and +# DifferentialD, used in the operator. # # +# unicode-block: when given, the Unicode block, or named ranges of code points. 
+## # unicode-equivalent: A unicode equivalent for the named-character, if it # exists. If it is the same as "ascii", please omit. # @@ -2150,18 +2155,18 @@ Digamma: wl-unicode-name: GREEK SMALL LETTER DIGAMMA # The WL symbol displays with a round dot at the left endpoint. -# The unicode equivalent shows omits this # When there is a tag over the edge, WL uses a bold variant # of the symbol. +# Note: not the same as \[Rule] or \[RightArrow] DirectedEdge: - amslatex: '\rightarrow' + amslatex: '\mathrel{\cdot\rightarrow}' esc-alias: de has-unicode-inverse: false is-letter-like: false operator-name: DirectedEdge - unicode-equivalent: "\u2192" - unicode-equivalent-name: RIGHTWARDS ARROW - unicode-reference: https://www.compart.com/en/unicode/U+2192 + unicode-equivalent: "\u21F4" + unicode-equivalent-name: RIGHT ARROW WITH SMALL CIRCLE + unicode-reference: https://www.compart.com/en/unicode/U+21F4 wl-reference: https://reference.wolfram.com/language/ref/character/DirectedEdge.html wl-unicode: "\uF3D5" @@ -8506,7 +8511,7 @@ RightAngleBracket: wl-unicode: "\u232A" wl-unicode-name: RIGHT-POINTING ANGLE BRACKET -# Note: not the same as \[Rule] +# Note: not the same as \[Rule] or \[DirectedEdge] RightArrow: amslatex: '\rightarrow' esc-alias: ' ->' @@ -8859,16 +8864,18 @@ RoundSpaceIndicator: wl-reference: https://reference.wolfram.com/language/ref/character/RoundSpaceIndicator.html wl-unicode: "\uF3B2" -# Note: not the same as \[RightArrow] +# Note: not the same as \[RightArrow] or \[DirectedEdge] Rule: + amslatex: '\vrightarrow' ascii: "->" esc-alias: "->" has-unicode-inverse: false is-letter-like: false operator-name: Rule - unicode-equivalent: "\u2192" - unicode-equivalent-name: RIGHTWARDS ARROW - unicode-reference: https://www.compart.com/en/unicode/U+2192 + unicode-equivalent: "\u21FE" + unicode-equivalent-name: RIGHTWARDS OPEN-HEADED ARROW + unicode-block: Arrows + unicode-reference: https://www.compart.com/en/unicode/U+21FE wl-reference: 
https://reference.wolfram.com/language/ref/character/Rule.html wl-unicode: "\uF522" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 0b2cc69..1a0b841 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -290,14 +290,13 @@ def init_module(): ("Greater", r" \> "), ("Less", r" \< "), # https://reference.wolfram.com/language/ref/character/DirectedEdge.html - # The official Unicode value is \u2192. - ("DirectedEdge", r" -> | \uf3d5|\u2192"), + ("DirectedEdge", r" -> | \uf3d5|\u21F4 "), ("Or", r" (\|\|) | \u2228 "), ("And", r" (\&\&) | \u2227 "), ("RepeatedNull", r" \.\.\. "), ("Repeated", r" \.\. "), ("Alternatives", r" \| "), - ("Rule", r" (\-\>)|\uF522 "), + ("Rule", r" (\-\>)| \uF522|\u21FE"), ("RuleDelayed", r" (\:\>)|\uF51F "), # https://reference.wolfram.com/language/ref/character/UndirectedEdge.html # The official Unicode value is \u2194 diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index 727e1f4..e419877 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -40,6 +40,7 @@ def test_yaml_field_names(): "latex", "operator-name", "precedence", + "unicode-block", "unicode-equivalent", "unicode-equivalent-name", "unicode-reference", @@ -142,22 +143,29 @@ def test_unicode_name(): f"{k}'s unicode-equivalent doesn't have a unicode name (it's not valid unicode)" ) - real_name = v.get("unicode-equivalent-name") + name_in_yaml = v.get("unicode-equivalent-name") - if real_name is None: + if name_in_yaml is None: raise ValueError( "{k} has a unicode equivalent but doesn't have the unicode-equivalent-name field" ) if k == "VerticalBar": continue - assert real_name == expected_name or expected_name.startswith( - "MODIFIER LETTER SMALL SCHWA" - ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" + + # If unicodedata gives a different name, then it is possible that the same Unicode character + # resides in two different 
code blocks, and in the YAML file we used one that unicodedata uses. + # Sadly, terminals use unicodedata and don't have a way to specify a specific Unicode code + # block like Supplemental Arrows-C. + assert name_in_yaml == expected_name, ( + f"{k} has uncodedata set to {expected_name} but it YAML says it is {name_in_yaml}.\n" + "Change Unicode value in YAML to be unambiquous. " + ) else: - assert ( - "ascii" in v - ), f"{k} has unicode-equivalent-name set to {v['unicode-equivalent-name']} but it doesn't have a unicode or ascii equivalent" + assert "ascii" in v, ( + f"{k} has unicode-equivalent-name set to {v['unicode-equivalent-name']} " + "but it doesn't have a Unicode or ASCII equivalent" + ) def test_wl_unicode():