diff --git a/.github/workflows/haskell.yml b/.github/workflows/haskell.yml index c7e9572..a2eb2c7 100644 --- a/.github/workflows/haskell.yml +++ b/.github/workflows/haskell.yml @@ -47,7 +47,6 @@ jobs: fail-fast: false matrix: name: - - 8.0.2 - 8.2.2 - 8.4.4 - 8.6.5 @@ -65,14 +64,6 @@ jobs: - hlint include: - - name: 8.0.2 - use_haskell_actions: true - ghc_version: 8.0.2 - runner: ubuntu-latest - cabal_version: 3.2.0.0 - pack_options: DISABLE_TEST=y - ignore_error: false - - name: 8.2.2 use_haskell_actions: true ghc_version: 8.2.2 diff --git a/experimental/icu/cbits/icu.c b/experimental/icu/cbits/icu.c index d4639a1..f8ab992 100644 --- a/experimental/icu/cbits/icu.c +++ b/experimental/icu/cbits/icu.c @@ -52,6 +52,10 @@ int32_t __hs_uscript_getScriptExtensions return uscript_getScriptExtensions(codepoint, scripts, capacity, &err); } +int __hs_getMaxScript(void) { + return u_getIntPropertyMaxValue(UCHAR_SCRIPT); +} + const char * __hs_uscript_getShortName(UScriptCode scriptCode) { return uscript_getShortName(scriptCode); } diff --git a/experimental/icu/cbits/icu.h b/experimental/icu/cbits/icu.h index 97062d5..f910451 100644 --- a/experimental/icu/cbits/icu.h +++ b/experimental/icu/cbits/icu.h @@ -41,6 +41,8 @@ int32_t __hs_uscript_getScriptExtensions , UScriptCode * scripts , int32_t capacity ); +int __hs_getMaxScript(void); + const char * __hs_uscript_getShortName(UScriptCode scriptCode); #endif diff --git a/experimental/icu/lib/ICU/Scripts.hsc b/experimental/icu/lib/ICU/Scripts.hsc index 14ca12b..0460ade 100644 --- a/experimental/icu/lib/ICU/Scripts.hsc +++ b/experimental/icu/lib/ICU/Scripts.hsc @@ -2,6 +2,7 @@ module ICU.Scripts ( Script(..) + , maxSupportedScript , script , codepointScript , scriptShortName @@ -35,9 +36,15 @@ foreign import ccall safe "icu.h __hs_uscript_getScript" uscript_getScript foreign import ccall unsafe "icu.h __hs_uscript_getScriptExtensions" uscript_getScriptExtensions :: UChar32 -> Ptr UScriptCode -> Int32 -> IO Int32 +foreign import ccall unsafe "icu.h __hs_getMaxScript" getMaxScript + :: IO CInt + foreign import ccall unsafe "icu.h __hs_uscript_getShortName" uscript_getShortName :: UScriptCode -> IO CString +maxSupportedScript :: Script +maxSupportedScript = toEnum (fromIntegral (unsafePerformIO getMaxScript)) + {-# INLINE codepointScript #-} codepointScript :: Word32 -> Script -- codepointScript = toEnum . unsafePerformIO . with 0 . uscript_getScript @@ -65,13 +72,13 @@ scriptExtensionsRaw capacity = 30 scriptShortName :: Script -> String -scriptShortName - = unsafePerformIO - . (uscript_getShortName . fromIntegral . fromEnum >=> peekCString) +scriptShortName s = if s <= maxSupportedScript + then unsafePerformIO ((uscript_getShortName . fromIntegral . fromEnum >=> peekCString) s) + else "" -- See: https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uscript_8h_source.html --- Last sync: 2023-03-09 +-- Last sync: 2025-09-13 data Script = Common -- ^ USCRIPT_COMMON = 0 @@ -282,4 +289,9 @@ data Script | Sunu -- ^ USCRIPT_SUNUWAR = 205 | Todr -- ^ USCRIPT_TODHRI = 206 | Tutg -- ^ USCRIPT_TULU_TIGALARI = 207 - deriving (Bounded, Enum, Eq, Show) + | Berf -- ^ USCRIPT_BERIA_ERFE = 208 + | Sidt -- ^ USCRIPT_SIDETIC = 209 + | Tayo -- ^ USCRIPT_TAI_YO = 210 + | Tols -- ^ USCRIPT_TOLONG_SIKI = 211 + | Hntl -- ^ USCRIPT_TRADITIONAL_HAN_WITH_LATIN = 212 + deriving (Bounded, Enum, Eq, Ord, Show) diff --git a/experimental/unicode-data-text/unicode-data-text.cabal b/experimental/unicode-data-text/unicode-data-text.cabal index 7916aec..f395e1c 100644 --- a/experimental/unicode-data-text/unicode-data-text.cabal +++ b/experimental/unicode-data-text/unicode-data-text.cabal @@ -16,8 +16,7 @@ copyright: 2022 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 +tested-with: GHC==8.2.2 , GHC==8.4.4 , GHC==8.6.5 , GHC==8.8.4 diff --git a/ucd2haskell/exe/UCD2Haskell/Generator.hs b/ucd2haskell/exe/UCD2Haskell/Generator.hs index 444c558..b258d27 100644 --- a/ucd2haskell/exe/UCD2Haskell/Generator.hs +++ b/ucd2haskell/exe/UCD2Haskell/Generator.hs @@ -8,6 +8,7 @@ module UCD2Haskell.Generator ( -- * Recipe FileRecipe(..) -- * Generator + , UnicodeSourceType(..) , runGenerator , moduleToFileName , dirFromFileName @@ -49,7 +50,7 @@ import Data.Maybe (mapMaybe) import Data.Ratio ((%)) import qualified Data.Set as Set import Data.String (IsString (..)) -import Data.Version (Version, showVersion) +import Data.Version (Version, makeVersion, showVersion) import qualified Data.Vector.Unboxed as V import Data.Word (Word32, Word8) import Debug.Trace (trace) @@ -99,25 +100,38 @@ type GeneratorRecipe a = [FileRecipe a] -- Generator -------------------------------------------------------------------------------- +data UnicodeSourceType = UCD | Security + moduleToFileName :: String -> String moduleToFileName = map (\x -> if x == '.' then '/' else x) dirFromFileName :: String -> String dirFromFileName = reverse . dropWhile (/= '/') . reverse -moduleFileEmitter :: Version -> FilePath -> FilePath -> ModuleRecipe a -> Fold a (IO ()) -moduleFileEmitter version unicodeSourceFile outdir (modName, fldGen) = +moduleFileEmitter :: Version -> UnicodeSourceType -> FilePath -> FilePath -> ModuleRecipe a -> Fold a (IO ()) +moduleFileEmitter version sourceType unicodeSourceFile outdir (modName, fldGen) = rmapFold action $ fldGen (BB.string7 modName) where pretext = mconcat - [ "-- autogenerated from https://www.unicode.org/Public/" - , BB.string7 (showVersion version) - , "/ucd/" - , BB.string7 unicodeSourceFile - ,"\n" - ] + $ "-- autogenerated from https://www.unicode.org/Public/" + : case sourceType of + Security | version < makeVersion [17, 0, 0] -> + [ "security/" + , BB.string7 (showVersion version) + , "/" + , BB.string7 unicodeSourceFile + , "\n" + ] + _ -> + [ BB.string7 (showVersion version) + , case sourceType of + UCD -> "/ucd/" + Security -> "/security/" + , BB.string7 unicodeSourceFile + , "\n" + ] outfile = outdir moduleToFileName modName <.> "hs" outfiledir = dirFromFileName outfile action c = do @@ -133,6 +147,7 @@ printCpuTime = do runGenerator :: Version + -> UnicodeSourceType -> FilePath -> FilePath -> (B.ByteString -> [a]) @@ -140,7 +155,7 @@ runGenerator :: -> [String] -> GeneratorRecipe a -> IO () -runGenerator version indir file transformLines outdir patterns recipes = do +runGenerator version sourceType indir file transformLines outdir patterns recipes = do raw <- B.readFile (indir file) sequence_ (runFold combinedFld (transformLines raw)) @@ -149,7 +164,7 @@ runGenerator version indir file transformLines outdir patterns recipes = do generatedFolds = mapMaybe toModuleEmitter recipes toModuleEmitter = \case ModuleRecipe name f -> if all (`L.isSubsequenceOf` name) patterns - then Just (moduleFileEmitter version file outdir (name, f)) + then Just (moduleFileEmitter version sourceType file outdir (name, f)) else Nothing combinedFld = distribute generatedFolds diff --git a/ucd2haskell/exe/UCD2Haskell/Generator/Core.hs b/ucd2haskell/exe/UCD2Haskell/Generator/Core.hs index a10b863..f69f14c 100644 --- a/ucd2haskell/exe/UCD2Haskell/Generator/Core.hs +++ b/ucd2haskell/exe/UCD2Haskell/Generator/Core.hs @@ -30,7 +30,7 @@ import qualified UCD2Haskell.Modules.UnicodeData.Decomposition as Decomposition import qualified UCD2Haskell.Modules.UnicodeData.GeneralCategory as GeneralCategory import qualified UCD2Haskell.Modules.UnicodeData.SimpleCaseMappings as SimpleCaseMappings import qualified UCD2Haskell.Modules.Version as Version -import UCD2Haskell.Generator (runGenerator) +import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator) generateModules :: Version -> FilePath -> FilePath -> [String] -> [String] -> IO () generateModules version indir outdir patterns props = do @@ -44,7 +44,7 @@ generateModules version indir outdir patterns props = do specialCasings <- SpecialCasings.parse <$> B.readFile (indir "SpecialCasing.txt") - let runGenerator' = runGenerator version indir + let runGenerator' = runGenerator version UCD indir runGenerator' "Blocks.txt" diff --git a/ucd2haskell/exe/UCD2Haskell/Generator/Names.hs b/ucd2haskell/exe/UCD2Haskell/Generator/Names.hs index baaf07b..c5f7194 100644 --- a/ucd2haskell/exe/UCD2Haskell/Generator/Names.hs +++ b/ucd2haskell/exe/UCD2Haskell/Generator/Names.hs @@ -16,12 +16,13 @@ import qualified Unicode.CharacterDatabase.Parser.NameAliases as NA import qualified UCD2Haskell.Modules.UnicodeData.DerivedNames as Names import qualified UCD2Haskell.Modules.UnicodeData.NameAliases as NameAliases import qualified UCD2Haskell.Modules.Version as Version -import UCD2Haskell.Generator (runGenerator) +import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator) generateModules :: Version -> FilePath -> FilePath -> [String] -> IO () generateModules version indir outdir patterns = do runGenerator version + UCD indir ("extracted" "DerivedName.txt") N.parse @@ -31,6 +32,7 @@ generateModules version indir outdir patterns = do runGenerator version + UCD indir "NameAliases.txt" NA.parse diff --git a/ucd2haskell/exe/UCD2Haskell/Generator/Scripts.hs b/ucd2haskell/exe/UCD2Haskell/Generator/Scripts.hs index cd5843a..c74caca 100644 --- a/ucd2haskell/exe/UCD2Haskell/Generator/Scripts.hs +++ b/ucd2haskell/exe/UCD2Haskell/Generator/Scripts.hs @@ -16,7 +16,7 @@ import qualified Unicode.CharacterDatabase.Parser.Properties.Single as Prop import qualified UCD2Haskell.Modules.Scripts as Scripts import qualified UCD2Haskell.Modules.ScriptsExtensions as ScriptsExtensions import qualified UCD2Haskell.Modules.Version as Version -import UCD2Haskell.Generator (runGenerator) +import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator) generateModules :: Version -> FilePath -> FilePath -> [String] -> IO () generateModules version indir outdir patterns = do @@ -28,6 +28,7 @@ generateModules version indir outdir patterns = do runGenerator version + UCD indir "Scripts.txt" Prop.parse diff --git a/ucd2haskell/exe/UCD2Haskell/Generator/Security.hs b/ucd2haskell/exe/UCD2Haskell/Generator/Security.hs index 9256c62..5a2d2b9 100644 --- a/ucd2haskell/exe/UCD2Haskell/Generator/Security.hs +++ b/ucd2haskell/exe/UCD2Haskell/Generator/Security.hs @@ -16,12 +16,13 @@ import qualified UCD2Haskell.Modules.Security.IdentifierStatus as IdentifierStat import qualified UCD2Haskell.Modules.Security.IdentifierType as IdentifierType import qualified UCD2Haskell.Modules.Security.IntentionalConfusables as IntentionalConfusables import qualified UCD2Haskell.Modules.Version as Version -import UCD2Haskell.Generator (runGenerator) +import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator) generateModules :: Version -> FilePath -> FilePath -> [String] -> IO () generateModules version indir outdir patterns = do runGenerator version + Security indir "IdentifierStatus.txt" Prop.parse @@ -31,6 +32,7 @@ generateModules version indir outdir patterns = do runGenerator version + Security indir "IdentifierType.txt" Prop.parse @@ -40,6 +42,7 @@ generateModules version indir outdir patterns = do runGenerator version + Security indir "confusables.txt" Prop.parseMultipleValues @@ -49,6 +52,7 @@ generateModules version indir outdir patterns = do runGenerator version + Security indir "intentional.txt" Prop.parse diff --git a/ucd2haskell/exe/UCD2Haskell/Modules/UnicodeData/DerivedNames.hs b/ucd2haskell/exe/UCD2Haskell/Modules/UnicodeData/DerivedNames.hs index 475fc74..0f8471d 100644 --- a/ucd2haskell/exe/UCD2Haskell/Modules/UnicodeData/DerivedNames.hs +++ b/ucd2haskell/exe/UCD2Haskell/Modules/UnicodeData/DerivedNames.hs @@ -39,10 +39,16 @@ genNamesModule moduleName = Fold step initial done step acc = \case N.SingleChar{..} -> step' acc char name - N.CharRange{..} -> foldl' - (\a c -> step' a c (mkName prefix c)) - acc - [start..end] + N.CharRange{..} -> if prefix `elem` rangePrefixes + then foldl' + (\a c -> step' a c (mkName prefix c)) + acc + [start..end] + else error . mconcat $ + [ "Unexpected name range: " + , show prefix + , ". Please update the generator and the " + , "Unicode.Char.General.Names* modules" ] mkName prefix c = prefix <> showHexCodepointBS c @@ -95,6 +101,14 @@ genNamesModule moduleName = Fold step initial done nushu = 0xf5 hangul = 0x80 + rangePrefixes = + [ "CJK COMPATIBILITY IDEOGRAPH-" + , "CJK UNIFIED IDEOGRAPH-" + , "TANGUT IDEOGRAPH-" + , "EGYPTIAN HIEROGLYPH-" + , "KHITAN SMALL SCRIPT CHARACTER-" + , "NUSHU CHARACTER-" ] + encodeName name | BS.take 28 name == "CJK COMPATIBILITY IDEOGRAPH-" = ("", cjkCompat, 0, True) | BS.take 22 name == "CJK UNIFIED IDEOGRAPH-" = ("", cjkUnified, 0, True) @@ -102,7 +116,7 @@ genNamesModule moduleName = Fold step initial done | BS.take 20 name == "EGYPTIAN HIEROGLYPH-" = ("", egyptianHieroglyph, 0, True) | BS.take 30 name == "KHITAN SMALL SCRIPT CHARACTER-" = ("", khitan, 0, True) | BS.take 16 name == "NUSHU CHARACTER-" = ("", nushu, 0, True) - | BS.take 16 name == "HANGUL SYLLABLE " = + | BS.take 16 name == "HANGUL SYLLABLE " = let !name' = BS.drop 16 name; !len = BS.length name' in if len <= 12 then (name', hangul + len, len, True) diff --git a/unicode-data-names/Changelog.md b/unicode-data-names/Changelog.md index 6409d22..d38e835 100644 --- a/unicode-data-names/Changelog.md +++ b/unicode-data-names/Changelog.md @@ -1,6 +1,6 @@ # Changelog -## 0.5.0 (August 2025) +## 0.5.0 (September 2025) - Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/). diff --git a/unicode-data-names/test/ICU/NamesSpec.hs b/unicode-data-names/test/ICU/NamesSpec.hs index 17a4a76..928dbb1 100644 --- a/unicode-data-names/test/ICU/NamesSpec.hs +++ b/unicode-data-names/test/ICU/NamesSpec.hs @@ -57,6 +57,8 @@ spec = do #endif where ourUnicodeVersion = versionBranch U.unicodeVersion + theirUnicodeVersion = take 3 (versionBranch ICU.unicodeVersion) + versionMismatch = ourUnicodeVersion /= theirUnicodeVersion showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c) -- There is no feature to display warnings other than `trace`, so @@ -85,6 +87,8 @@ spec = do -- Unicode version mismatch: char is not mapped in one of the libs: -- add warning. | ageMismatch c = acc{warnings=c : warnings acc} + -- Unicode version mismatch + | versionMismatch = acc{warnings=c : warnings acc} -- Error | otherwise = let !msg = mconcat diff --git a/unicode-data-names/unicode-data-names.cabal b/unicode-data-names/unicode-data-names.cabal index 574b3a3..d9597b0 100644 --- a/unicode-data-names/unicode-data-names.cabal +++ b/unicode-data-names/unicode-data-names.cabal @@ -20,8 +20,7 @@ copyright: 2022 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 +tested-with: GHC==8.2.2 , GHC==8.4.4 , GHC==8.6.5 , GHC==8.8.4 diff --git a/unicode-data-scripts/Changelog.md b/unicode-data-scripts/Changelog.md index 4c8e47d..734f4d4 100644 --- a/unicode-data-scripts/Changelog.md +++ b/unicode-data-scripts/Changelog.md @@ -1,6 +1,6 @@ # Changelog -## 0.5.0 (August 2025) +## 0.5.0 (September 2025) - Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/). diff --git a/unicode-data-scripts/test/ICU/ScriptsSpec.hs b/unicode-data-scripts/test/ICU/ScriptsSpec.hs index 407184c..c971c97 100644 --- a/unicode-data-scripts/test/ICU/ScriptsSpec.hs +++ b/unicode-data-scripts/test/ICU/ScriptsSpec.hs @@ -8,9 +8,8 @@ import Data.Char (toUpper, ord) import Data.Foldable (traverse_) import qualified Data.List as L import qualified Data.List.NonEmpty as NE -import Data.Maybe (isJust) import Data.Version (versionBranch, showVersion) -import Debug.Trace (traceM) +import Debug.Trace (trace, traceM) import Numeric (showHex) import Test.Hspec ( Spec, it, expectationFailure, shouldSatisfy ) @@ -20,9 +19,19 @@ import qualified Unicode.Char.General.Scripts as S spec :: Spec spec = do - let icuScripts = (\s -> (ICU.scriptShortName s, s)) <$> [minBound..maxBound] it "scriptShortName" - let check = isJust . (`lookup` icuScripts) . S.scriptShortName + let check s = case toIcuScript s of + Just _ -> True + Nothing + | versionMismatch -> trace (mconcat + [ "[WARNING] Cannot test scriptShortName for " + , show s + , ": incompatible ICU version (" + , showVersion ICU.unicodeVersion + , " /= " + , showVersion S.unicodeVersion + , ")." ]) True + | otherwise -> False in traverse_ (`shouldSatisfy` check) [minBound..maxBound] it "script" let check c @@ -48,10 +57,23 @@ spec = do let { check s = case lookup (S.scriptShortName s) icuScripts of - Nothing -> error ("Cannot convert script: " ++ show s) + Nothing + | ourUnicodeVersion > theirUnicodeVersion + -> traceM . mconcat $ + [ "[WARNING] Cannot convert script " + , show s + , ": incompatible ICU version (" + , showVersion ICU.unicodeVersion + , " /= " + , showVersion S.unicodeVersion + , "). " + , "Max supported ICU script:" + , show ICU.maxSupportedScript ] + | otherwise -> error ("Cannot convert script: " ++ show s) Just s' | def == defRef -> pure () - | ourUnicodeVersion /= theirUnicodeVersion -> traceM . mconcat $ + | ourUnicodeVersion /= theirUnicodeVersion + -> traceM . mconcat $ [ "[WARNING] Cannot test " , show s , ": incompatible ICU version (" @@ -106,3 +128,5 @@ spec = do theirUnicodeVersion = take 3 (versionBranch ICU.unicodeVersion) showCodePoint c = ("U+" ++) . fmap toUpper $ showHex (ord c) "" versionMismatch = ourUnicodeVersion /= theirUnicodeVersion + icuScripts = (\s -> (ICU.scriptShortName s, s)) <$> [minBound..maxBound] + toIcuScript = (`lookup` icuScripts) . S.scriptShortName diff --git a/unicode-data-scripts/unicode-data-scripts.cabal b/unicode-data-scripts/unicode-data-scripts.cabal index 4c6fa26..83bbbd2 100644 --- a/unicode-data-scripts/unicode-data-scripts.cabal +++ b/unicode-data-scripts/unicode-data-scripts.cabal @@ -20,8 +20,7 @@ copyright: 2022 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 +tested-with: GHC==8.2.2 , GHC==8.4.4 , GHC==8.6.5 , GHC==8.8.4 diff --git a/unicode-data-security/Changelog.md b/unicode-data-security/Changelog.md index 33ee27b..8c18308 100644 --- a/unicode-data-security/Changelog.md +++ b/unicode-data-security/Changelog.md @@ -1,6 +1,6 @@ # Changelog -## 0.5.0 (August 2025) +## 0.5.0 (September 2025) - Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/). diff --git a/unicode-data-security/lib/Unicode/Internal/Char/Security/Confusables.hs b/unicode-data-security/lib/Unicode/Internal/Char/Security/Confusables.hs index 8d7b3a9..c4a8e33 100644 --- a/unicode-data-security/lib/Unicode/Internal/Char/Security/Confusables.hs +++ b/unicode-data-security/lib/Unicode/Internal/Char/Security/Confusables.hs @@ -1,4 +1,4 @@ --- autogenerated from https://www.unicode.org/Public/16.0.0/ucd/confusables.txt +-- autogenerated from https://www.unicode.org/Public/security/16.0.0/confusables.txt -- | -- Module : Unicode.Internal.Char.Security.Confusables -- Copyright : (c) 2022 Composewell Technologies and Contributors diff --git a/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierStatus.hs b/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierStatus.hs index 6e0ef9d..61565c1 100644 --- a/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierStatus.hs +++ b/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierStatus.hs @@ -1,4 +1,4 @@ --- autogenerated from https://www.unicode.org/Public/16.0.0/ucd/IdentifierStatus.txt +-- autogenerated from https://www.unicode.org/Public/security/16.0.0/IdentifierStatus.txt -- | -- Module : Unicode.Internal.Char.Security.IdentifierStatus -- Copyright : (c) 2022 Composewell Technologies and Contributors diff --git a/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierType.hs b/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierType.hs index 1e38aa0..d0fcd24 100644 --- a/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierType.hs +++ b/unicode-data-security/lib/Unicode/Internal/Char/Security/IdentifierType.hs @@ -1,4 +1,4 @@ --- autogenerated from https://www.unicode.org/Public/16.0.0/ucd/IdentifierType.txt +-- autogenerated from https://www.unicode.org/Public/security/16.0.0/IdentifierType.txt -- | -- Module : Unicode.Internal.Char.Security.IdentifierType -- Copyright : (c) 2022 Composewell Technologies and Contributors diff --git a/unicode-data-security/lib/Unicode/Internal/Char/Security/IntentionalConfusables.hs b/unicode-data-security/lib/Unicode/Internal/Char/Security/IntentionalConfusables.hs index f9be786..e435913 100644 --- a/unicode-data-security/lib/Unicode/Internal/Char/Security/IntentionalConfusables.hs +++ b/unicode-data-security/lib/Unicode/Internal/Char/Security/IntentionalConfusables.hs @@ -1,4 +1,4 @@ --- autogenerated from https://www.unicode.org/Public/16.0.0/ucd/intentional.txt +-- autogenerated from https://www.unicode.org/Public/security/16.0.0/intentional.txt -- | -- Module : Unicode.Internal.Char.Security.IntentionalConfusables -- Copyright : (c) 2022 Composewell Technologies and Contributors diff --git a/unicode-data-security/unicode-data-security.cabal b/unicode-data-security/unicode-data-security.cabal index 7c1bd16..9f95ead 100644 --- a/unicode-data-security/unicode-data-security.cabal +++ b/unicode-data-security/unicode-data-security.cabal @@ -20,8 +20,7 @@ copyright: 2022 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 +tested-with: GHC==8.2.2 , GHC==8.4.4 , GHC==8.6.5 , GHC==8.8.4 diff --git a/unicode-data/Changelog.md b/unicode-data/Changelog.md index cc45437..0d8cac7 100644 --- a/unicode-data/Changelog.md +++ b/unicode-data/Changelog.md @@ -1,6 +1,6 @@ # Changelog -## 0.7.0 (August 2025) +## 0.7.0 (September 2025) - Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/). diff --git a/unicode-data/test/ICU/CharSpec.hs b/unicode-data/test/ICU/CharSpec.hs index 2492854..b259471 100644 --- a/unicode-data/test/ICU/CharSpec.hs +++ b/unicode-data/test/ICU/CharSpec.hs @@ -5,6 +5,8 @@ module ICU.CharSpec ) where import Control.Applicative (Alternative(..)) +import Data.Bits (Bits(..)) +import qualified Data.Char as Char import Data.Foldable (traverse_) import Data.Version (showVersion, versionBranch) import Numeric (showHex) @@ -35,6 +37,12 @@ spec = do ourUnicodeVersion = versionBranch U.unicodeVersion theirUnicodeVersion = versionBranch ICU.unicodeVersion showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c) + -- Check if the character is not assigned in exactly one Unicode version. + isUnassigned c = (U.generalCategory c == U.NotAssigned) + `xor` (ICU.toGeneralCategory (ICU.charType c) == Char.NotAssigned) + -- Check if the character has changed its general category + hasDifferentCategory c = fromEnum (U.generalCategory c) + /= fromEnum (ICU.toGeneralCategory (ICU.charType c)) -- There is no feature to display warnings other than `trace`, so -- hack our own: @@ -61,8 +69,11 @@ spec = do | n == nRef = acc -- Unicode version mismatch: char is not mapped in one of the libs: -- add warning. - | age' > ourUnicodeVersion || age' > theirUnicodeVersion - = acc{warnings=c : warnings acc} + | age' > ourUnicodeVersion || age' > theirUnicodeVersion || + isUnassigned c + = acc{warnings=(c, Unassigned) : warnings acc} + | hasDifferentCategory c + = acc{warnings=(c, CategoryChange) : warnings acc} -- Error | otherwise = let !msg = mconcat @@ -75,14 +86,18 @@ spec = do !nRef = fRef c age = ICU.charAge c age' = take 3 (versionBranch age) - mkWarning c = it (showCodePoint c "") . pendingWith $ mconcat + mkWarning (c, reason) = it (showCodePoint c "") . pendingWith $ mconcat [ "Incompatible ICU Unicode version: expected " , showVersion U.unicodeVersion , ", got: " , showVersion ICU.unicodeVersion - , " (ICU character age is: " - , showVersion (ICU.charAge c) - , ")" ] + , case reason of + Unassigned -> mconcat + [ " (ICU character age is: " + , showVersion (ICU.charAge c) + , ")" ] + CategoryChange -> " (different general category)" + ] -- | Helper to compare our GeneralCategory to 'Data.Char.GeneralCategory'. data GeneralCategory = forall c. (Show c, Enum c) => GeneralCategory c @@ -93,6 +108,12 @@ instance Show GeneralCategory where instance Eq GeneralCategory where GeneralCategory a == GeneralCategory b = fromEnum a == fromEnum b +data MismatchReason + = Unassigned + | CategoryChange + -- | Warning accumulator -data Acc = Acc { warnings :: ![Char], firstError :: !(Maybe String) } +data Acc = Acc + { warnings :: ![(Char, MismatchReason)] + , firstError :: !(Maybe String) } diff --git a/unicode-data/unicode-data.cabal b/unicode-data/unicode-data.cabal index 86456ad..f2ac615 100644 --- a/unicode-data/unicode-data.cabal +++ b/unicode-data/unicode-data.cabal @@ -20,8 +20,7 @@ copyright: 2020 Composewell Technologies and Contributors category: Data,Text,Unicode stability: Experimental build-type: Simple -tested-with: GHC==8.0.2 - , GHC==8.2.2 +tested-with: GHC==8.2.2 , GHC==8.4.4 , GHC==8.6.5 , GHC==8.8.4