|
1 | 1 | import os |
2 | 2 | from fnmatch import fnmatch |
3 | 3 | from typing import Dict, List, Union |
4 | | -import asyncio |
| 4 | +import tiktoken |
5 | 5 |
|
6 | | -from tokencost import count_string_tokens |
7 | | -from gitingest.parse_query import parse_query |
8 | 6 |
|
9 | 7 | MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB |
10 | 8 | MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal |
@@ -267,18 +265,21 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo |
267 | 265 | return tree |
268 | 266 |
|
def generate_token_string(context_string: str) -> Union[str, None]:
    """Return a human-readable token count for ``context_string``.

    Tokens are counted with tiktoken's ``cl100k_base`` encoding and the
    count is abbreviated for display, e.g. ``"532"``, ``"1.5k"``, ``"2.1M"``.

    Parameters:
        context_string: The text whose tokens should be counted.

    Returns:
        The formatted token count, or ``None`` if token counting fails
        (the error is printed rather than raised, so callers get a
        best-effort result).
    """
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
        # disallowed_special=() makes special-token text encode as plain
        # text instead of raising, since arbitrary file contents are fed in.
        total_tokens = len(encoding.encode(context_string, disallowed_special=()))
    except Exception as e:
        print(e)
        return None
    if total_tokens > 1000000:
        return f"{total_tokens / 1000000:.1f}M"
    if total_tokens > 1000:
        return f"{total_tokens / 1000:.1f}k"
    return f"{total_tokens}"
283 | 284 |
|
284 | 285 | def ingest_single_file(path: str, query: dict) -> Dict: |
|
0 commit comments