11using System . Net . Http ;
22using System . Text ;
3+ using System . Linq ;
34using NUnit . Framework ;
45
56namespace SharpToken . Tests ;
67
78public class Tests
89{
9- private static readonly List < string > ModelsList = new ( ) { "p50k_base" , "r50k_base" , "cl100k_base" , "o200k_base" } ;
10+ private static readonly List < string > ModelsList = new ( ) { "p50k_base" , "r50k_base" , "cl100k_base" , "o200k_base" , "o200k_harmony" } ;
1011
1112 private static readonly List < Tuple < string , string , List < int > > > TestData =
1213 TestHelpers . ReadTestPlans ( "SharpToken.Tests.data.TestPlans.txt" ) ;
@@ -23,7 +24,19 @@ public void TestEncodingAndDecoding(Tuple<string, string, List<int>> resource)
2324 var ( encodingName , textToEncode , expectedEncoded ) = resource ;
2425
2526 var encoding = GptEncoding . GetEncoding ( encodingName ) ;
26- var encoded = encoding . Encode ( textToEncode ) ;
27+
28+ // Detect if the text contains special tokens
29+ var allowedSpecial = new HashSet < string > ( ) ;
30+ var specialTokens = GetSpecialTokensForEncoding ( encodingName ) ;
31+ foreach ( var token in specialTokens )
32+ {
33+ if ( textToEncode . Contains ( token ) )
34+ {
35+ allowedSpecial . Add ( token ) ;
36+ }
37+ }
38+
39+ var encoded = encoding . Encode ( textToEncode , allowedSpecial ) ;
2740 var decodedText = encoding . Decode ( encoded ) ;
2841 Assert . Multiple ( ( ) =>
2942 {
@@ -39,7 +52,19 @@ public void TestTokensLength(Tuple<string, string, List<int>> resource)
3952 var ( encodingName , textToEncode , expectedEncoded ) = resource ;
4053
4154 var encoding = GptEncoding . GetEncoding ( encodingName ) ;
42- var tokenLength = encoding . CountTokens ( textToEncode ) ;
55+
56+ // Detect if the text contains special tokens
57+ var allowedSpecial = new HashSet < string > ( ) ;
58+ var specialTokens = GetSpecialTokensForEncoding ( encodingName ) ;
59+ foreach ( var token in specialTokens )
60+ {
61+ if ( textToEncode . Contains ( token ) )
62+ {
63+ allowedSpecial . Add ( token ) ;
64+ }
65+ }
66+
67+ var tokenLength = encoding . CountTokens ( textToEncode , allowedSpecial ) ;
4368 Assert . Multiple ( ( ) =>
4469 {
4570 Assert . That ( tokenLength , Is . EqualTo ( expectedEncoded . Count ) ) ;
@@ -53,7 +78,19 @@ public async Task TestEncodingAndDecodingInParallel()
5378 {
5479 var ( encodingName , textToEncode , expectedEncoded ) = _ ;
5580 var encoding = GptEncoding . GetEncoding ( encodingName ) ;
56- var encoded = encoding . Encode ( textToEncode ) ;
81+
82+ // Detect if the text contains special tokens
83+ var allowedSpecial = new HashSet < string > ( ) ;
84+ var specialTokens = GetSpecialTokensForEncoding ( encodingName ) ;
85+ foreach ( var token in specialTokens )
86+ {
87+ if ( textToEncode . Contains ( token ) )
88+ {
89+ allowedSpecial . Add ( token ) ;
90+ }
91+ }
92+
93+ var encoded = encoding . Encode ( textToEncode , allowedSpecial ) ;
5794 var decodedText = encoding . Decode ( encoded ) ;
5895 return ( textToEncode , encoded , expectedEncoded , decodedText ) ;
5996 } ) ) ;
@@ -162,6 +199,13 @@ static void TestModelPrefixMappingFailsAction()
162199 [ TestCaseSource ( nameof ( ModelsList ) ) ]
163200 public async Task TestLocalResourceMatchesRemoteResource ( string modelName )
164201 {
202+ // Skip o200k_harmony as it reuses o200k_base.tiktoken and doesn't have its own remote file
203+ if ( modelName == "o200k_harmony" )
204+ {
205+ Assert . Pass ( "o200k_harmony reuses o200k_base.tiktoken file and doesn't have its own remote file" ) ;
206+ return ;
207+ }
208+
165209 var embeddedResourceName = $ "SharpToken.data.{ modelName } .tiktoken";
166210 var remoteResourceUrl = $ "https://openaipublic.blob.core.windows.net/encodings/{ modelName } .tiktoken";
167211
@@ -199,4 +243,63 @@ public void TestEncodingForModel()
199243 Assert . That ( decodedText , Is . EqualTo ( inputText ) ) ;
200244 } ) ;
201245 }
246+
/// <summary>
/// Exercises the "o200k_harmony" encoding: a plain-text round trip, a round trip of
/// text containing special tokens, and the ids produced for individual special tokens.
/// </summary>
[Test]
public void TestO200KHarmonySpecialTokens()
{
    const string inputText = "Hello, world!";
    const string textWithSpecialTokens = "Hello <|startoftext|> world <|call|> test <|reserved_200020|>";

    // Special tokens used by this test, paired with the id each one is expected to encode to.
    var expectedSpecialTokenIds = new (string Token, int Id)[]
    {
        ("<|startoftext|>", 199998),
        ("<|call|>", 200012),
        ("<|reserved_200020|>", 200020),
    };

    var encoding = GptEncoding.GetEncoding("o200k_harmony");

    // Plain text, no special tokens involved.
    var encoded = encoding.Encode(inputText);
    var decodedText = encoding.Decode(encoded);

    // Text containing special tokens; every one of them is explicitly allowed.
    var allowedSpecial = new HashSet<string>(expectedSpecialTokenIds.Select(entry => entry.Token));
    var encodedSpecial = encoding.Encode(textWithSpecialTokens, allowedSpecial: allowedSpecial);
    var decodedSpecial = encoding.Decode(encodedSpecial);

    // Group the assertions so a single failure does not hide the remaining checks.
    Assert.Multiple(() =>
    {
        Assert.That(decodedText, Is.EqualTo(inputText));
        Assert.That(decodedSpecial, Is.EqualTo(textWithSpecialTokens));

        // Each special token, encoded on its own, must yield exactly its single id.
        foreach (var (token, id) in expectedSpecialTokenIds)
        {
            var tokenIds = encoding.Encode(token, allowedSpecial: new HashSet<string> { token });
            Assert.That(tokenIds, Is.EqualTo(new List<int> { id }));
        }
    });
}
273+
/// <summary>
/// Checks that every listed GPT-5 model name resolves to the "o200k_base" encoding name.
/// </summary>
[Test]
public void TestGPT5ModelMappings()
{
    const string expectedEncodingName = "o200k_base";

    // Order matters only for which failure is reported first; it mirrors the
    // exact names first, then the dated / suffixed variants.
    var gpt5ModelNames = new[]
    {
        // Exact GPT-5 model names
        "gpt-5",
        "gpt-5-mini",
        "gpt-5-nano",
        "gpt-5-pro",
        "gpt-5-thinking",

        // Variants expected to resolve through prefix matching
        "gpt-5-2024-08-07",
        "gpt-5-chat-latest",
    };

    foreach (var modelName in gpt5ModelNames)
    {
        var resolvedEncodingName = Model.GetEncodingNameForModel(modelName);
        Assert.That(resolvedEncodingName, Is.EqualTo(expectedEncodingName));
    }
}
288+
/// <summary>
/// Returns the special-token strings this test suite associates with an encoding name.
/// </summary>
/// <param name="encodingName">Encoding name, e.g. "cl100k_base" or "o200k_harmony".</param>
/// <returns>
/// A new set of special-token strings for <paramref name="encodingName"/>;
/// an empty set when the name is not one of the cases below.
/// </returns>
private static HashSet<string> GetSpecialTokensForEncoding(string encodingName)
{
    switch (encodingName)
    {
        case "r50k_base":
        case "p50k_base":
            return new HashSet<string> { "<|endoftext|>" };

        case "p50k_edit":
            return new HashSet<string>
            {
                "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"
            };

        case "cl100k_base":
            return new HashSet<string>
            {
                "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>"
            };

        case "o200k_base":
            return new HashSet<string> { "<|endoftext|>", "<|endofprompt|>" };

        case "o200k_harmony":
        {
            const int firstReservedId = 200000;
            const int reservedCount = 1088;

            var harmonyTokens = new HashSet<string>
            {
                "<|endoftext|>", "<|endofprompt|>", "<|startoftext|>", "<|return|>", "<|constrain|>",
                "<|channel|>", "<|start|>", "<|end|>", "<|message|>", "<|call|>"
            };

            // Append one "<|reserved_N|>" entry for each N in [200000, 200000 + 1088).
            for (var reservedId = firstReservedId; reservedId < firstReservedId + reservedCount; reservedId++)
            {
                harmonyTokens.Add($"<|reserved_{reservedId}|>");
            }

            return harmonyTokens;
        }

        default:
            return new HashSet<string>();
    }
}
202305}
0 commit comments