From af87eed334b8d7b61e0872b25a29e6e057203e47 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 Jan 2026 07:47:39 -0500 Subject: [PATCH 1/5] Reassign Rule Unicode.. "Rule" is not the same things as "RightArrow". Use a unicode glyph that matches the Mathematica symbol better. Note that the arrowhead is not a triangle, but two lines. --- mathics_scanner/data/named-characters.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 48e0cd8..30250af 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -8866,9 +8866,11 @@ Rule: has-unicode-inverse: false is-letter-like: false operator-name: Rule - unicode-equivalent: "\u2192" - unicode-equivalent-name: RIGHTWARDS ARROW - unicode-reference: https://www.compart.com/en/unicode/U+2192 + unicode-equivalent: "\u1F862" + ## It seems there are two names for this. The one used, is the one that our test checker finds. + # unicode-equivalent-name: WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW + unicode-equivalent-name: GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + DIGIT TWO + unicode-reference: https://www.compart.com/en/unicode/U+1F862 wl-reference: https://reference.wolfram.com/language/ref/character/Rule.html wl-unicode: "\uF522" From 94ae0ea5e9ad47e88fd8fb5fb346309a8f4d30fa Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 Jan 2026 11:01:58 -0500 Subject: [PATCH 2/5] Start an operator block. Also note that Mathics3 operators can use more than one symbol. --- mathics_scanner/data/named-characters.yml | 28 +++++++++++++---------- test/test_general_yaml_sanity.py | 15 +++++++++--- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 30250af..8335c80 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -51,20 +51,25 @@ # control-sequence is both allowed in text-mode and math-mode, # then the same control sequence also appears in amslatex. # -# operator-name: If present, this symbol is a Mathics3 operator with -# whose class name is the given name. For example, the -# Divide operator, "/" is here. But some symbols like -# "`" or "." inside a number could be considered -# either an operator or as part of a lexical symbol. -# "operators.yml" may contain use of these symbols, while here we -# might not (or we might). Also, from an operator +# operator-name: If present, this symbol is a is part of some +# Mathics3 operator with whose class name is the +# given name. For example, the Divide operator, "/" +# is here. But some symbols like "`" or "." inside a +# number could be considered either an operator or as +# part of a lexical symbol. "operators.yml" may +# contain use of these symbols, while here we might +# not (or we might). Also, from an operator # perspective, an operator name like "Association" # might have *two* (bracketing) symbols associated # here: "LeftAssociation" and "RightAssociation". -# More operator information can be found in -# file "operators.yml". +# More operator information can be found in file +# "operators.yml". Note that some operators like +# Integrate, have several symbols, e.g. Integral and +# DifferentialD used in the operator. # # +# unicode-block: when given, the unicode block, or named ranges of code points. +## # unicode-equivalent: A unicode equivalent for the named-character, if it # exists. If it is the same as "ascii", please omit. # @@ -8867,9 +8872,8 @@ Rule: is-letter-like: false operator-name: Rule unicode-equivalent: "\u1F862" - ## It seems there are two names for this. The one used, is the one that our test checker finds. - # unicode-equivalent-name: WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW - unicode-equivalent-name: GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + DIGIT TWO + unicode-equivalent-name: WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW + unicode-block: Arrows unicode-reference: https://www.compart.com/en/unicode/U+1F862 wl-reference: https://reference.wolfram.com/language/ref/character/Rule.html wl-unicode: "\uF522" diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index 727e1f4..12f3e25 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -40,6 +40,7 @@ def test_yaml_field_names(): "latex", "operator-name", "precedence", + "unicode-block", "unicode-equivalent", "unicode-equivalent-name", "unicode-reference", @@ -151,9 +152,17 @@ def test_unicode_name(): if k == "VerticalBar": continue - assert real_name == expected_name or expected_name.startswith( - "MODIFIER LETTER SMALL SCHWA" - ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" + + # uncodedata sometimes gives a different name, and there is no way that I + # know of to allow it narrow its results to a particular unicode block, + # or find out what unicode block it is useing + if real_name not in ( + "WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW", + # "MODIFIER LETTER SMALL SCHWA", + ): + assert ( + real_name == expected_name + ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" else: assert ( "ascii" in v From a392925af41b3f1d31478780f1743acfc92dfa98 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 30 Jan 2026 17:16:17 -0500 Subject: [PATCH 3/5] Reassign Rule unicode to arrow block... Also make sure to parse this unicode symbol as Rule. --- mathics_scanner/data/named-characters.yml | 6 +++--- mathics_scanner/tokeniser.py | 2 +- test/test_general_yaml_sanity.py | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 8335c80..cebff7b 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -8871,10 +8871,10 @@ Rule: has-unicode-inverse: false is-letter-like: false operator-name: Rule - unicode-equivalent: "\u1F862" - unicode-equivalent-name: WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW + unicode-equivalent: "\u21FE" + unicode-equivalent-name: Rightwards Open-Headed Arrow unicode-block: Arrows - unicode-reference: https://www.compart.com/en/unicode/U+1F862 + unicode-reference: https://www.compart.com/en/unicode/U+21FE wl-reference: https://reference.wolfram.com/language/ref/character/Rule.html wl-unicode: "\uF522" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 0b2cc69..3ceeff0 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -297,7 +297,7 @@ def init_module(): ("RepeatedNull", r" \.\.\. "), ("Repeated", r" \.\. "), ("Alternatives", r" \| "), - ("Rule", r" (\-\>)|\uF522 "), + ("Rule", r" (\-\>)| \uF522|\u21FE"), ("RuleDelayed", r" (\:\>)|\uF51F "), # https://reference.wolfram.com/language/ref/character/UndirectedEdge.html # The official Unicode value is \u2194 diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index 12f3e25..6fbedc6 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -153,16 +153,16 @@ def test_unicode_name(): if k == "VerticalBar": continue - # uncodedata sometimes gives a different name, and there is no way that I - # know of to allow it narrow its results to a particular unicode block, - # or find out what unicode block it is useing - if real_name not in ( - "WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW", - # "MODIFIER LETTER SMALL SCHWA", - ): - assert ( - real_name == expected_name - ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" + # # uncodedata sometimes gives a different name, and there is no way that I + # # know of to allow it narrow its results to a particular unicode block, + # # or find out what unicode block it is useing + # if real_name not in ( + # "WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW", + # # "MODIFIER LETTER SMALL SCHWA", + # ): + # assert ( + # real_name == expected_name + # ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" else: assert ( "ascii" in v From ef724090ba38853a84b0df26488a034f9d1ad2d7 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 30 Jan 2026 17:41:01 -0500 Subject: [PATCH 4/5] Adjust test messages --- mathics_scanner/data/named-characters.yml | 2 +- test/test_general_yaml_sanity.py | 29 +++++++++++------------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index cebff7b..374aca1 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -8872,7 +8872,7 @@ Rule: is-letter-like: false operator-name: Rule unicode-equivalent: "\u21FE" - unicode-equivalent-name: Rightwards Open-Headed Arrow + unicode-equivalent-name: RIGHTWARDS OPEN-HEADED ARROW unicode-block: Arrows unicode-reference: https://www.compart.com/en/unicode/U+21FE wl-reference: https://reference.wolfram.com/language/ref/character/Rule.html diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index 6fbedc6..dcf6d6d 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -143,9 +143,9 @@ def test_unicode_name(): f"{k}'s unicode-equivalent doesn't have a unicode name (it's not valid unicode)" ) - real_name = v.get("unicode-equivalent-name") + name_in_yaml = v.get("unicode-equivalent-name") - if real_name is None: + if name_in_yaml is None: raise ValueError( "{k} has a unicode equivalent but doesn't have the unicode-equivalent-name field" ) @@ -153,20 +153,19 @@ def test_unicode_name(): if k == "VerticalBar": continue - # # uncodedata sometimes gives a different name, and there is no way that I - # # know of to allow it narrow its results to a particular unicode block, - # # or find out what unicode block it is useing - # if real_name not in ( - # "WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW", - # # "MODIFIER LETTER SMALL SCHWA", - # ): - # assert ( - # real_name == expected_name - # ), f"{k} has unicode-equivalent-name set to {real_name} but it should be {expected_name}" + # If uncodedata gives a different name, then it is possible that the same Unicode character + # resides in two different code blocks, and in the YAML file we used one that uncodedata uses. + # Sadly, since terminals use uncodedata and don't have a way to specify a specific Unicode code + # block like Supplimental Arrows-C. + assert name_in_yaml == expected_name, ( + f"{k} has unicodedata set to {expected_name} but it YAML says it is {name_in_yaml}.\n" + "Change Unicode value in YAML to be unambiquous. " + ) else: - assert ( - "ascii" in v - ), f"{k} has unicode-equivalent-name set to {v['unicode-equivalent-name']} but it doesn't have a unicode or ascii equivalent" + assert "ascii" in v, ( + f"{k} has unicode-equivalent-name set to {v['unicode-equivalent-name']} " + "but it doesn't have a Unicode or ASCII equivalent" + ) def test_wl_unicode(): From d9807e9ff89c41140e80a873f305a1f389471b3b Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 31 Jan 2026 06:21:17 -0500 Subject: [PATCH 5/5] Separate DirectedEdge from RightArrow --- mathics_scanner/characters.py | 1 - mathics_scanner/data/named-characters.yml | 15 ++++++++------- mathics_scanner/tokeniser.py | 3 +-- test/test_general_yaml_sanity.py | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index f3aaeb9..057dc9c 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -6,7 +6,6 @@ and Unicode/ASCII. """ -import os import os.path as osp import re diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 374aca1..db3daea 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -2155,18 +2155,18 @@ Digamma: wl-unicode-name: GREEK SMALL LETTER DIGAMMA # The WL symbol displays with a round dot at the left endpoint. -# The unicode equivalent shows omits this # When there is a tag over the edge, WL uses a bold variant # of the symbol. +# Note: not the same as \[Rule] or \[RightArrow] DirectedEdge: - amslatex: '\rightarrow' + amslatex: '\mathrel{\cdot\rightarrow}' esc-alias: de has-unicode-inverse: false is-letter-like: false operator-name: DirectedEdge - unicode-equivalent: "\u2192" - unicode-equivalent-name: RIGHTWARDS ARROW - unicode-reference: https://www.compart.com/en/unicode/U+2192 + unicode-equivalent: "\u21F4" + unicode-equivalent-name: RIGHT ARROW WITH SMALL CIRCLE + unicode-reference: https://www.compart.com/en/unicode/U+21F4 wl-reference: https://reference.wolfram.com/language/ref/character/DirectedEdge.html wl-unicode: "\uF3D5" @@ -8511,7 +8511,7 @@ RightAngleBracket: wl-unicode: "\u232A" wl-unicode-name: RIGHT-POINTING ANGLE BRACKET -# Note: not the same as \[Rule] +# Note: not the same as \[Rule] or \[DirectedEdge] RightArrow: amslatex: '\rightarrow' esc-alias: ' ->' @@ -8864,8 +8864,9 @@ RoundSpaceIndicator: wl-reference: https://reference.wolfram.com/language/ref/character/RoundSpaceIndicator.html wl-unicode: "\uF3B2" -# Note: not the same as \[RightArrow] +# Note: not the same as \[RightArrow] or \[DirectedEdge] Rule: + amslatex: '\vrightarrow' ascii: "->" esc-alias: "->" has-unicode-inverse: false diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3ceeff0..1a0b841 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -290,8 +290,7 @@ def init_module(): ("Greater", r" \> "), ("Less", r" \< "), # https://reference.wolfram.com/language/ref/character/DirectedEdge.html - # The official Unicode value is \u2192. - ("DirectedEdge", r" -> | \uf3d5|\u2192"), + ("DirectedEdge", r" -> | \uf3d5|\u21F4 "), ("Or", r" (\|\|) | \u2228 "), ("And", r" (\&\&) | \u2227 "), ("RepeatedNull", r" \.\.\. "), diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index dcf6d6d..e419877 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -158,7 +158,7 @@ def test_unicode_name(): # Sadly, since terminals use uncodedata and don't have a way to specify a specific Unicode code # block like Supplimental Arrows-C. assert name_in_yaml == expected_name, ( - f"{k} has unicodedata set to {expected_name} but it YAML says it is {name_in_yaml}.\n" + f"{k} has uncodedata set to {expected_name} but it YAML says it is {name_in_yaml}.\n" "Change Unicode value in YAML to be unambiquous. " ) else: