Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions .github/workflows/haskell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ jobs:
fail-fast: false
matrix:
name:
- 8.0.2
- 8.2.2
- 8.4.4
- 8.6.5
Expand All @@ -65,14 +64,6 @@ jobs:
- hlint
include:

- name: 8.0.2
use_haskell_actions: true
ghc_version: 8.0.2
runner: ubuntu-latest
cabal_version: 3.2.0.0
pack_options: DISABLE_TEST=y
ignore_error: false

- name: 8.2.2
use_haskell_actions: true
ghc_version: 8.2.2
Expand Down
4 changes: 4 additions & 0 deletions experimental/icu/cbits/icu.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ int32_t __hs_uscript_getScriptExtensions
return uscript_getScriptExtensions(codepoint, scripts, capacity, &err);
}

/* Report the maximum value of the UCHAR_SCRIPT property as returned by
 * ICU's u_getIntPropertyMaxValue, i.e. the highest script code the linked
 * ICU library reports. */
int __hs_getMaxScript(void) {
    const int32_t max_script = u_getIntPropertyMaxValue(UCHAR_SCRIPT);
    return max_script;
}

/* Forward the script code to ICU's uscript_getShortName and hand back its
 * result unchanged. */
const char * __hs_uscript_getShortName(UScriptCode scriptCode) {
    const char * short_name = uscript_getShortName(scriptCode);
    return short_name;
}
2 changes: 2 additions & 0 deletions experimental/icu/cbits/icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ int32_t __hs_uscript_getScriptExtensions
, UScriptCode * scripts
, int32_t capacity );

/* Maximum value of the UCHAR_SCRIPT property, obtained from ICU's
 * u_getIntPropertyMaxValue (implemented in icu.c). */
int __hs_getMaxScript(void);

/* Wrapper around ICU's uscript_getShortName (implemented in icu.c). */
const char * __hs_uscript_getShortName(UScriptCode scriptCode);

#endif
22 changes: 17 additions & 5 deletions experimental/icu/lib/ICU/Scripts.hsc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

module ICU.Scripts
( Script(..)
, maxSupportedScript
, script
, codepointScript
, scriptShortName
Expand Down Expand Up @@ -35,9 +36,15 @@ foreign import ccall safe "icu.h __hs_uscript_getScript" uscript_getScript
foreign import ccall unsafe "icu.h __hs_uscript_getScriptExtensions" uscript_getScriptExtensions
:: UChar32 -> Ptr UScriptCode -> Int32 -> IO Int32

-- | Binding to the C helper @__hs_getMaxScript@ declared in @icu.h@.
-- The result is the raw C @int@; callers convert it themselves.
foreign import ccall unsafe "icu.h __hs_getMaxScript" getMaxScript
:: IO CInt

-- | Binding to the C helper @__hs_uscript_getShortName@ declared in @icu.h@.
-- Takes a raw script code and yields a C string that the caller must marshal.
foreign import ccall unsafe "icu.h __hs_uscript_getShortName" uscript_getShortName
:: UScriptCode -> IO CString

-- | Greatest 'Script' that the linked ICU library supports.
--
-- ICU reports the maximum value of its script property at runtime. That value
-- is clamped to the range of 'Script' before conversion: the derived 'toEnum'
-- fails on out-of-range arguments, which would otherwise happen whenever the
-- linked ICU knows script codes beyond the last constructor of 'Script'.
{-# NOINLINE maxSupportedScript #-}
maxSupportedScript :: Script
maxSupportedScript = toEnum (max lowest (min highest reported))
  where
    lowest = fromEnum (minBound :: Script)
    highest = fromEnum (maxBound :: Script)
    -- The ICU call is treated as a pure query; NOINLINE keeps the
    -- 'unsafePerformIO' result a single shared top-level value.
    reported = fromIntegral (unsafePerformIO getMaxScript) :: Int

{-# INLINE codepointScript #-}
codepointScript :: Word32 -> Script
-- codepointScript = toEnum . unsafePerformIO . with 0 . uscript_getScript
Expand Down Expand Up @@ -65,13 +72,13 @@ scriptExtensionsRaw
capacity = 30

scriptShortName :: Script -> String
scriptShortName
= unsafePerformIO
. (uscript_getShortName . fromIntegral . fromEnum >=> peekCString)
-- Scripts above 'maxSupportedScript' are never passed to ICU and yield the
-- empty string; every other script is resolved through ICU.
scriptShortName s
    | s <= maxSupportedScript = unsafePerformIO (lookupName s)
    | otherwise = ""
  where
    -- Convert the script to its numeric code, ask ICU for the short name
    -- and marshal the returned C string.
    lookupName sc = uscript_getShortName (fromIntegral (fromEnum sc)) >>= peekCString


-- See: https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uscript_8h_source.html
-- Last sync: 2023-03-09
-- Last sync: 2025-09-13

data Script
= Common -- ^ USCRIPT_COMMON = 0
Expand Down Expand Up @@ -282,4 +289,9 @@ data Script
| Sunu -- ^ USCRIPT_SUNUWAR = 205
| Todr -- ^ USCRIPT_TODHRI = 206
| Tutg -- ^ USCRIPT_TULU_TIGALARI = 207
deriving (Bounded, Enum, Eq, Show)
| Berf -- ^ USCRIPT_BERIA_ERFE = 208
| Sidt -- ^ USCRIPT_SIDETIC = 209
| Tayo -- ^ USCRIPT_TAI_YO = 210
| Tols -- ^ USCRIPT_TOLONG_SIKI = 211
| Hntl -- ^ USCRIPT_TRADITIONAL_HAN_WITH_LATIN = 212
deriving (Bounded, Enum, Eq, Ord, Show)
3 changes: 1 addition & 2 deletions experimental/unicode-data-text/unicode-data-text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ copyright: 2022 Composewell Technologies and Contributors
category: Data,Text,Unicode
stability: Experimental
build-type: Simple
tested-with: GHC==8.0.2
, GHC==8.2.2
tested-with: GHC==8.2.2
, GHC==8.4.4
, GHC==8.6.5
, GHC==8.8.4
Expand Down
37 changes: 26 additions & 11 deletions ucd2haskell/exe/UCD2Haskell/Generator.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module UCD2Haskell.Generator
( -- * Recipe
FileRecipe(..)
-- * Generator
, UnicodeSourceType(..)
, runGenerator
, moduleToFileName
, dirFromFileName
Expand Down Expand Up @@ -49,7 +50,7 @@ import Data.Maybe (mapMaybe)
import Data.Ratio ((%))
import qualified Data.Set as Set
import Data.String (IsString (..))
import Data.Version (Version, showVersion)
import Data.Version (Version, makeVersion, showVersion)
import qualified Data.Vector.Unboxed as V
import Data.Word (Word32, Word8)
import Debug.Trace (trace)
Expand Down Expand Up @@ -99,25 +100,38 @@ type GeneratorRecipe a = [FileRecipe a]
-- Generator
--------------------------------------------------------------------------------

-- | Which section of the Unicode public data a source file belongs to.
-- It selects the path segment used in the \"autogenerated from\" header URL
-- of each emitted module.
data UnicodeSourceType
    = UCD
    -- ^ File found under the @ucd/@ directory.
    | Security
    -- ^ File found under the @security/@ directory.

-- | Turn a dotted module name into a relative path by replacing every @.@
-- with @/@; all other characters are kept as they are.
moduleToFileName :: String -> String
moduleToFileName = map dotToSlash
  where
    dotToSlash '.' = '/'
    dotToSlash c = c

-- | Keep the part of a path up to and including its last @/@. A path that
-- contains no @/@ yields the empty string.
dirFromFileName :: String -> String
dirFromFileName path = reverse (dropWhile notSlash (reverse path))
  where
    notSlash ch = ch /= '/'

moduleFileEmitter :: Version -> FilePath -> FilePath -> ModuleRecipe a -> Fold a (IO ())
moduleFileEmitter version unicodeSourceFile outdir (modName, fldGen) =
moduleFileEmitter :: Version -> UnicodeSourceType -> FilePath -> FilePath -> ModuleRecipe a -> Fold a (IO ())
moduleFileEmitter version sourceType unicodeSourceFile outdir (modName, fldGen) =
rmapFold action $ fldGen (BB.string7 modName)

where

pretext = mconcat
[ "-- autogenerated from https://www.unicode.org/Public/"
, BB.string7 (showVersion version)
, "/ucd/"
, BB.string7 unicodeSourceFile
,"\n"
]
$ "-- autogenerated from https://www.unicode.org/Public/"
: case sourceType of
Security | version < makeVersion [17, 0, 0] ->
[ "security/"
, BB.string7 (showVersion version)
, "/"
, BB.string7 unicodeSourceFile
, "\n"
]
_ ->
[ BB.string7 (showVersion version)
, case sourceType of
UCD -> "/ucd/"
Security -> "/security/"
, BB.string7 unicodeSourceFile
, "\n"
]
outfile = outdir </> moduleToFileName modName <.> "hs"
outfiledir = dirFromFileName outfile
action c = do
Expand All @@ -133,14 +147,15 @@ printCpuTime = do

runGenerator ::
Version
-> UnicodeSourceType
-> FilePath
-> FilePath
-> (B.ByteString -> [a])
-> FilePath
-> [String]
-> GeneratorRecipe a
-> IO ()
runGenerator version indir file transformLines outdir patterns recipes = do
runGenerator version sourceType indir file transformLines outdir patterns recipes = do
raw <- B.readFile (indir </> file)
sequence_ (runFold combinedFld (transformLines raw))

Expand All @@ -149,7 +164,7 @@ runGenerator version indir file transformLines outdir patterns recipes = do
generatedFolds = mapMaybe toModuleEmitter recipes
toModuleEmitter = \case
ModuleRecipe name f -> if all (`L.isSubsequenceOf` name) patterns
then Just (moduleFileEmitter version file outdir (name, f))
then Just (moduleFileEmitter version sourceType file outdir (name, f))
else Nothing
combinedFld = distribute generatedFolds

Expand Down
4 changes: 2 additions & 2 deletions ucd2haskell/exe/UCD2Haskell/Generator/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import qualified UCD2Haskell.Modules.UnicodeData.Decomposition as Decomposition
import qualified UCD2Haskell.Modules.UnicodeData.GeneralCategory as GeneralCategory
import qualified UCD2Haskell.Modules.UnicodeData.SimpleCaseMappings as SimpleCaseMappings
import qualified UCD2Haskell.Modules.Version as Version
import UCD2Haskell.Generator (runGenerator)
import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator)

generateModules :: Version -> FilePath -> FilePath -> [String] -> [String] -> IO ()
generateModules version indir outdir patterns props = do
Expand All @@ -44,7 +44,7 @@ generateModules version indir outdir patterns props = do
specialCasings <- SpecialCasings.parse
<$> B.readFile (indir </> "SpecialCasing.txt")

let runGenerator' = runGenerator version indir
let runGenerator' = runGenerator version UCD indir

runGenerator'
"Blocks.txt"
Expand Down
4 changes: 3 additions & 1 deletion ucd2haskell/exe/UCD2Haskell/Generator/Names.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ import qualified Unicode.CharacterDatabase.Parser.NameAliases as NA
import qualified UCD2Haskell.Modules.UnicodeData.DerivedNames as Names
import qualified UCD2Haskell.Modules.UnicodeData.NameAliases as NameAliases
import qualified UCD2Haskell.Modules.Version as Version
import UCD2Haskell.Generator (runGenerator)
import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator)

generateModules :: Version -> FilePath -> FilePath -> [String] -> IO ()
generateModules version indir outdir patterns = do
runGenerator
version
UCD
indir
("extracted" </> "DerivedName.txt")
N.parse
Expand All @@ -31,6 +32,7 @@ generateModules version indir outdir patterns = do

runGenerator
version
UCD
indir
"NameAliases.txt"
NA.parse
Expand Down
3 changes: 2 additions & 1 deletion ucd2haskell/exe/UCD2Haskell/Generator/Scripts.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import qualified Unicode.CharacterDatabase.Parser.Properties.Single as Prop
import qualified UCD2Haskell.Modules.Scripts as Scripts
import qualified UCD2Haskell.Modules.ScriptsExtensions as ScriptsExtensions
import qualified UCD2Haskell.Modules.Version as Version
import UCD2Haskell.Generator (runGenerator)
import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator)

generateModules :: Version -> FilePath -> FilePath -> [String] -> IO ()
generateModules version indir outdir patterns = do
Expand All @@ -28,6 +28,7 @@ generateModules version indir outdir patterns = do

runGenerator
version
UCD
indir
"Scripts.txt"
Prop.parse
Expand Down
6 changes: 5 additions & 1 deletion ucd2haskell/exe/UCD2Haskell/Generator/Security.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ import qualified UCD2Haskell.Modules.Security.IdentifierStatus as IdentifierStat
import qualified UCD2Haskell.Modules.Security.IdentifierType as IdentifierType
import qualified UCD2Haskell.Modules.Security.IntentionalConfusables as IntentionalConfusables
import qualified UCD2Haskell.Modules.Version as Version
import UCD2Haskell.Generator (runGenerator)
import UCD2Haskell.Generator (UnicodeSourceType(..), runGenerator)

generateModules :: Version -> FilePath -> FilePath -> [String] -> IO ()
generateModules version indir outdir patterns = do
runGenerator
version
Security
indir
"IdentifierStatus.txt"
Prop.parse
Expand All @@ -31,6 +32,7 @@ generateModules version indir outdir patterns = do

runGenerator
version
Security
indir
"IdentifierType.txt"
Prop.parse
Expand All @@ -40,6 +42,7 @@ generateModules version indir outdir patterns = do

runGenerator
version
Security
indir
"confusables.txt"
Prop.parseMultipleValues
Expand All @@ -49,6 +52,7 @@ generateModules version indir outdir patterns = do

runGenerator
version
Security
indir
"intentional.txt"
Prop.parse
Expand Down
24 changes: 19 additions & 5 deletions ucd2haskell/exe/UCD2Haskell/Modules/UnicodeData/DerivedNames.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,16 @@ genNamesModule moduleName = Fold step initial done

step acc = \case
N.SingleChar{..} -> step' acc char name
N.CharRange{..} -> foldl'
(\a c -> step' a c (mkName prefix c))
acc
[start..end]
N.CharRange{..} -> if prefix `elem` rangePrefixes
then foldl'
(\a c -> step' a c (mkName prefix c))
acc
[start..end]
else error . mconcat $
[ "Unexpected name range: "
, show prefix
, ". Please update the generator and the "
, "Unicode.Char.General.Names* modules" ]

mkName prefix c = prefix <> showHexCodepointBS c

Expand Down Expand Up @@ -95,14 +101,22 @@ genNamesModule moduleName = Fold step initial done
nushu = 0xf5
hangul = 0x80

-- Name prefixes accepted for character ranges: a range is expanded into
-- per-character names only when its prefix is listed here, and any other
-- prefix aborts generation with an error. The same prefixes are
-- special-cased in encodeName, so the two need to be kept in sync.
rangePrefixes =
[ "CJK COMPATIBILITY IDEOGRAPH-"
, "CJK UNIFIED IDEOGRAPH-"
, "TANGUT IDEOGRAPH-"
, "EGYPTIAN HIEROGLYPH-"
, "KHITAN SMALL SCRIPT CHARACTER-"
, "NUSHU CHARACTER-" ]

encodeName name
| BS.take 28 name == "CJK COMPATIBILITY IDEOGRAPH-" = ("", cjkCompat, 0, True)
| BS.take 22 name == "CJK UNIFIED IDEOGRAPH-" = ("", cjkUnified, 0, True)
| BS.take 17 name == "TANGUT IDEOGRAPH-" = ("", tangut, 0, True)
| BS.take 20 name == "EGYPTIAN HIEROGLYPH-" = ("", egyptianHieroglyph, 0, True)
| BS.take 30 name == "KHITAN SMALL SCRIPT CHARACTER-" = ("", khitan, 0, True)
| BS.take 16 name == "NUSHU CHARACTER-" = ("", nushu, 0, True)
| BS.take 16 name == "HANGUL SYLLABLE " =
| BS.take 16 name == "HANGUL SYLLABLE " =
let !name' = BS.drop 16 name; !len = BS.length name'
in if len <= 12
then (name', hangul + len, len, True)
Expand Down
2 changes: 1 addition & 1 deletion unicode-data-names/Changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

## 0.5.0 (August 2025)
## 0.5.0 (September 2025)

- Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/).

Expand Down
4 changes: 4 additions & 0 deletions unicode-data-names/test/ICU/NamesSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ spec = do
#endif
where
ourUnicodeVersion = versionBranch U.unicodeVersion
theirUnicodeVersion = take 3 (versionBranch ICU.unicodeVersion)
versionMismatch = ourUnicodeVersion /= theirUnicodeVersion
showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c)

-- There is no feature to display warnings other than `trace`, so
Expand Down Expand Up @@ -85,6 +87,8 @@ spec = do
-- Unicode version mismatch: char is not mapped in one of the libs:
-- add warning.
| ageMismatch c = acc{warnings=c : warnings acc}
-- Unicode version mismatch
| versionMismatch = acc{warnings=c : warnings acc}
-- Error
| otherwise =
let !msg = mconcat
Expand Down
3 changes: 1 addition & 2 deletions unicode-data-names/unicode-data-names.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ copyright: 2022 Composewell Technologies and Contributors
category: Data,Text,Unicode
stability: Experimental
build-type: Simple
tested-with: GHC==8.0.2
, GHC==8.2.2
tested-with: GHC==8.2.2
, GHC==8.4.4
, GHC==8.6.5
, GHC==8.8.4
Expand Down
2 changes: 1 addition & 1 deletion unicode-data-scripts/Changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

## 0.5.0 (August 2025)
## 0.5.0 (September 2025)

- Updated to [Unicode 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/).

Expand Down
Loading