Skip to content

Commit 3eb45d0

Browse files
new encoding support
1 parent c478ba9 commit 3eb45d0

File tree

6 files changed

+726
-58
lines changed

6 files changed

+726
-58
lines changed

README.md

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
[![Contributors](https://img.shields.io/github/contributors/dmitry-brazhenko/SharpToken.svg)](https://github.com/dmitry-brazhenko/SharpToken/graphs/contributors)
99
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
1010

11-
12-
1311
SharpToken is a C# library that serves as a port of the Python [tiktoken](https://github.com/openai/tiktoken) library.
1412
It provides functionality for encoding and decoding tokens using GPT-based encodings. This library is built for .NET 6, .NET 8
1513
and .NET Standard 2.0, making it compatible with a wide range of frameworks.
@@ -74,11 +72,12 @@ var count = encoding.CountTokens("Hello, world!"); // Output: 4
7472

7573
SharpToken currently supports the following encodings:
7674

77-
* `r50k_base`
78-
* `p50k_base`
79-
* `p50k_edit`
80-
* `cl100k_base`
81-
* `o200k_base`
75+
- `r50k_base`
76+
- `p50k_base`
77+
- `p50k_edit`
78+
- `cl100k_base`
79+
- `o200k_base`
80+
- `o200k_harmony`
8281

8382
You can use any of these encodings when creating an instance of GptEncoding:
8483

@@ -88,6 +87,7 @@ var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base");
8887
var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit");
8988
var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base");
9089
var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base");
90+
var o200kHarmonyEncoding = GptEncoding.GetEncoding("o200k_harmony");
9191
```
9292

9393
### Model Prefix Matching
@@ -96,14 +96,17 @@ Apart from specifying direct model names, SharpToken also provides functionality
9696

9797
Here are the current supported prefixes and their corresponding encodings:
9898

99-
| Model Prefix | Encoding |
100-
|---------------------|------------|
101-
| `gpt-4o` | `o200k_base` |
102-
| `gpt-4-` | `cl100k_base` |
103-
| `gpt-3.5-turbo-` | `cl100k_base` |
104-
| `gpt-35-turbo` | `cl100k_base` |
99+
| Model Prefix | Encoding |
100+
| ---------------- | ------------- |
101+
| `gpt-5` | `o200k_base` |
102+
| `gpt-4o` | `o200k_base` |
103+
| `gpt-4-` | `cl100k_base` |
104+
| `gpt-3.5-turbo-` | `cl100k_base` |
105+
| `gpt-35-turbo` | `cl100k_base` |
105106

106107
Examples of model names that fall under these prefixes include:
108+
109+
- For the prefix `gpt-5`: `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-pro`, `gpt-5-thinking`, `gpt-5-2024-08-07`, `gpt-5-chat-latest`, etc.
107110
- For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc.
108111
- For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc.
109112
- For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc.
@@ -117,9 +120,6 @@ string encodingName = Model.GetEncodingNameForModel("gpt-4-0314"); // This will
117120

118121
If the provided model name doesn't match any direct model names or prefixes, the method will return `null`.
119122

120-
121-
122-
123123
## Understanding Encoded Values
124124

125125
When you encode a string using the Encode method, the returned value is a list of integers that represent tokens in the
@@ -289,23 +289,23 @@ BenchmarkDotNet v0.13.9+228a464e8be6c580ad9408e98f18813f6407fb5a, Windows 11 (10
289289
.NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
290290
```
291291

292-
| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated |
293-
|------------------ |--------------------- |--------------------- |----------:|---------:|----------:|----------:|-----------:|----------:|----------:|
294-
| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB |
295-
| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB |
296-
| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB |
297-
| | | | | | | | | | |
298-
| *SharpToken* | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB |
299-
| *SharpToken* | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB |
300-
| *SharpToken* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB |
301-
| | | | | | | | | | |
302-
| *TokenizerLib* | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB |
303-
| *TokenizerLib* | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB |
304-
| *TokenizerLib* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB |
305-
| | | | | | | | | | |
306-
| *TiktokenSharp* | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB |
307-
| *TiktokenSharp* | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB |
308-
| *TiktokenSharp* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB |
292+
| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated |
293+
| ---------------- | -------------------- | -------------------- | --------: | -------: | --------: | --------: | ---------: | --------: | --------: |
294+
| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB |
295+
| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB |
296+
| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB |
297+
| | | | | | | | | | |
298+
| _SharpToken_ | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB |
299+
| _SharpToken_ | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB |
300+
| _SharpToken_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB |
301+
| | | | | | | | | | |
302+
| _TokenizerLib_ | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB |
303+
| _TokenizerLib_ | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB |
304+
| _TokenizerLib_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB |
305+
| | | | | | | | | | |
306+
| _TiktokenSharp_ | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB |
307+
| _TiktokenSharp_ | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB |
308+
| _TiktokenSharp_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB |
309309

310310
## Performance
311311

@@ -315,15 +315,16 @@ It uses modern multibyte CPU instructions and almost no heap allocations.
315315
All core methods have been tested on a large and a small input text.
316316

317317
**Inputs:**
318+
318319
- `SmallText`: 453 B (text/plain)
319320
- `LargeText`: 51 KB (text/html)
320321

321322
**Methods:**
323+
322324
- `Encode`: text to tokens
323325
- `Decode`: tokens to text
324326
- `CountTokens`: high performance API to count tokens of text
325327

326-
327328
```
328329
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3296/23H2/2023Update/SunValley3)
329330
AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
@@ -334,8 +335,8 @@ AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
334335
.NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
335336
```
336337

337-
| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio |
338-
|------------------------- |--------------:|------------:|------------:|------:|--------:|----------:|------------:|
338+
| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio |
339+
| ------------------------ | ------------: | ----------: | ----------: | ----: | ------: | --------: | ----------: |
339340
| **.NET 8.0** | | | | | | | |
340341
| Encode_SmallText | 22.649 us | 0.4244 us | 0.4359 us | 0.28 | 0.01 | 696 B | 0.02 |
341342
| Encode_LargeText | 4,542.505 us | 87.7988 us | 104.5182 us | 0.24 | 0.01 | 155547 B | 0.03 |

SharpToken.Tests/SharpToken.Tests.cs

Lines changed: 107 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
using System.Net.Http;
22
using System.Text;
3+
using System.Linq;
34
using NUnit.Framework;
45

56
namespace SharpToken.Tests;
67

78
public class Tests
89
{
9-
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" };
10+
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base", "o200k_harmony" };
1011

1112
private static readonly List<Tuple<string, string, List<int>>> TestData =
1213
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
@@ -23,7 +24,19 @@ public void TestEncodingAndDecoding(Tuple<string, string, List<int>> resource)
2324
var (encodingName, textToEncode, expectedEncoded) = resource;
2425

2526
var encoding = GptEncoding.GetEncoding(encodingName);
26-
var encoded = encoding.Encode(textToEncode);
27+
28+
// Detect if the text contains special tokens
29+
var allowedSpecial = new HashSet<string>();
30+
var specialTokens = GetSpecialTokensForEncoding(encodingName);
31+
foreach (var token in specialTokens)
32+
{
33+
if (textToEncode.Contains(token))
34+
{
35+
allowedSpecial.Add(token);
36+
}
37+
}
38+
39+
var encoded = encoding.Encode(textToEncode, allowedSpecial);
2740
var decodedText = encoding.Decode(encoded);
2841
Assert.Multiple(() =>
2942
{
@@ -39,7 +52,19 @@ public void TestTokensLength(Tuple<string, string, List<int>> resource)
3952
var (encodingName, textToEncode, expectedEncoded) = resource;
4053

4154
var encoding = GptEncoding.GetEncoding(encodingName);
42-
var tokenLength = encoding.CountTokens(textToEncode);
55+
56+
// Detect if the text contains special tokens
57+
var allowedSpecial = new HashSet<string>();
58+
var specialTokens = GetSpecialTokensForEncoding(encodingName);
59+
foreach (var token in specialTokens)
60+
{
61+
if (textToEncode.Contains(token))
62+
{
63+
allowedSpecial.Add(token);
64+
}
65+
}
66+
67+
var tokenLength = encoding.CountTokens(textToEncode, allowedSpecial);
4368
Assert.Multiple(() =>
4469
{
4570
Assert.That(tokenLength, Is.EqualTo(expectedEncoded.Count));
@@ -53,7 +78,19 @@ public async Task TestEncodingAndDecodingInParallel()
5378
{
5479
var (encodingName, textToEncode, expectedEncoded) = _;
5580
var encoding = GptEncoding.GetEncoding(encodingName);
56-
var encoded = encoding.Encode(textToEncode);
81+
82+
// Detect if the text contains special tokens
83+
var allowedSpecial = new HashSet<string>();
84+
var specialTokens = GetSpecialTokensForEncoding(encodingName);
85+
foreach (var token in specialTokens)
86+
{
87+
if (textToEncode.Contains(token))
88+
{
89+
allowedSpecial.Add(token);
90+
}
91+
}
92+
93+
var encoded = encoding.Encode(textToEncode, allowedSpecial);
5794
var decodedText = encoding.Decode(encoded);
5895
return (textToEncode, encoded, expectedEncoded, decodedText);
5996
}));
@@ -162,6 +199,13 @@ static void TestModelPrefixMappingFailsAction()
162199
[TestCaseSource(nameof(ModelsList))]
163200
public async Task TestLocalResourceMatchesRemoteResource(string modelName)
164201
{
202+
// Skip o200k_harmony as it reuses o200k_base.tiktoken and doesn't have its own remote file
203+
if (modelName == "o200k_harmony")
204+
{
205+
Assert.Pass("o200k_harmony reuses o200k_base.tiktoken file and doesn't have its own remote file");
206+
return;
207+
}
208+
165209
var embeddedResourceName = $"SharpToken.data.{modelName}.tiktoken";
166210
var remoteResourceUrl = $"https://openaipublic.blob.core.windows.net/encodings/{modelName}.tiktoken";
167211

@@ -199,4 +243,63 @@ public void TestEncodingForModel()
199243
Assert.That(decodedText, Is.EqualTo(inputText));
200244
});
201245
}
246+
247+
/// <summary>
/// Verifies the o200k_harmony encoding: plain text round-trips, text containing
/// explicitly allowed special tokens round-trips, and selected special tokens
/// encode to their expected single ids.
/// </summary>
[Test]
public void TestO200KHarmonySpecialTokens()
{
    var encoding = GptEncoding.GetEncoding("o200k_harmony");
    const string inputText = "Hello, world!";

    // Plain text must survive an encode/decode round trip.
    var encoded = encoding.Encode(inputText);
    var decodedText = encoding.Decode(encoded);
    Assert.That(decodedText, Is.EqualTo(inputText));

    // Text with special tokens must round-trip when those tokens are passed as allowed.
    var textWithSpecialTokens = "Hello <|startoftext|> world <|call|> test <|reserved_200020|>";
    var encodedSpecial = encoding.Encode(textWithSpecialTokens, allowedSpecial: new HashSet<string> { "<|startoftext|>", "<|call|>", "<|reserved_200020|>" });
    var decodedSpecial = encoding.Decode(encodedSpecial);

    Assert.That(decodedSpecial, Is.EqualTo(textWithSpecialTokens));

    // Each special token, encoded on its own, must produce exactly one token with the expected id.
    Assert.That(encoding.Encode("<|startoftext|>", allowedSpecial: new HashSet<string> { "<|startoftext|>" }), Is.EqualTo(new List<int> { 199998 }));
    Assert.That(encoding.Encode("<|call|>", allowedSpecial: new HashSet<string> { "<|call|>" }), Is.EqualTo(new List<int> { 200012 }));
    Assert.That(encoding.Encode("<|reserved_200020|>", allowedSpecial: new HashSet<string> { "<|reserved_200020|>" }), Is.EqualTo(new List<int> { 200020 }));
}
273+
274+
/// <summary>
/// Checks that GPT-5 model names resolve to the o200k_base encoding.
/// </summary>
[Test]
public void TestGPT5ModelMappings()
{
    const string expectedEncoding = "o200k_base";

    // The first five entries are the plain GPT-5 family names; the last two are
    // dated/suffixed variants that exercise prefix matching.
    string[] gpt5ModelNames =
    {
        "gpt-5",
        "gpt-5-mini",
        "gpt-5-nano",
        "gpt-5-pro",
        "gpt-5-thinking",
        "gpt-5-2024-08-07",
        "gpt-5-chat-latest"
    };

    // Assert one name at a time so the first mismatch fails the test immediately.
    foreach (var modelName in gpt5ModelNames)
    {
        Assert.That(Model.GetEncodingNameForModel(modelName), Is.EqualTo(expectedEncoding));
    }
}
288+
289+
/// <summary>
/// Builds the set of special-token strings associated with an encoding name, so the
/// tests can work out which special tokens appear in a piece of text.
/// </summary>
/// <param name="encodingName">Encoding name, e.g. "cl100k_base" or "o200k_harmony".</param>
/// <returns>
/// A newly created set of special-token strings; the set is empty when the encoding
/// name is not one of the names handled here.
/// </returns>
private static HashSet<string> GetSpecialTokensForEncoding(string encodingName)
{
    var tokens = new HashSet<string>();

    switch (encodingName)
    {
        case "r50k_base":
        case "p50k_base":
            tokens.Add("<|endoftext|>");
            break;

        case "p50k_edit":
            tokens.Add("<|endoftext|>");
            tokens.Add("<|fim_prefix|>");
            tokens.Add("<|fim_middle|>");
            tokens.Add("<|fim_suffix|>");
            break;

        case "cl100k_base":
            tokens.Add("<|endoftext|>");
            tokens.Add("<|fim_prefix|>");
            tokens.Add("<|fim_middle|>");
            tokens.Add("<|fim_suffix|>");
            tokens.Add("<|endofprompt|>");
            break;

        case "o200k_base":
            tokens.Add("<|endoftext|>");
            tokens.Add("<|endofprompt|>");
            break;

        case "o200k_harmony":
            tokens.Add("<|endoftext|>");
            tokens.Add("<|endofprompt|>");
            tokens.Add("<|startoftext|>");
            tokens.Add("<|return|>");
            tokens.Add("<|constrain|>");
            tokens.Add("<|channel|>");
            tokens.Add("<|start|>");
            tokens.Add("<|end|>");
            tokens.Add("<|message|>");
            tokens.Add("<|call|>");

            // Adds the 1088 consecutive "<|reserved_N|>" names, N = 200000 .. 201087.
            const int firstReserved = 200000;
            const int reservedCount = 1088;
            for (var i = firstReserved; i < firstReserved + reservedCount; i++)
            {
                tokens.Add($"<|reserved_{i}|>");
            }

            break;

        default:
            // Unknown (or null) encoding names yield an empty set.
            break;
    }

    return tokens;
}
202305
}

0 commit comments

Comments
 (0)