1111 FileReference ,
1212 CognitionIntegration ,
1313 IntegrationSharepoint ,
14- CognitionMarkdownDataset ,
15- CognitionMarkdownFile ,
1614)
1715
1816ETL_DIR = Path (os .getenv ("ETL_DIR" , "/app/data/etl" ))
@@ -24,14 +22,20 @@ def get_full_config_and_tokenizer_from_config_id(
2422 etl_config_id : Optional [str ] = None , # or in file_reference.meta_data
2523 content_type : Optional [str ] = None , # or in file_reference.content_type
2624 chunk_size : Optional [int ] = 1000 ,
25+ # only set for markdown datasets
26+ markdown_file_id : Optional [str ] = None , # or in file_reference.meta_data
2727 # only set for chat messages
28- project_id : Optional [str ] = None ,
29- conversation_id : Optional [str ] = None ,
28+ project_id : Optional [str ] = None , # or in file_reference.meta_data
29+ conversation_id : Optional [str ] = None , # or in file_reference.meta_data
3030) -> Tuple [Dict [str , Any ], str ]:
31+ for_dataset = False
3132 for_project = False
3233 if project_id and conversation_id :
3334 # project related load
3435 for_project = True
36+ elif markdown_file_id :
37+ # dataset related load
38+ for_dataset = True
3539
3640 etl_preset_item = etl_config_presets_db_co .get (
3741 etl_config_id or file_reference .meta_data .get ("etl_config_id" )
@@ -46,6 +50,11 @@ def get_full_config_and_tokenizer_from_config_id(
4650 "llmIdentifier" : llm_indicator_extract ,
4751 "overwriteVisionPrompt" : extraction_config .get ("overwriteVisionPrompt" ),
4852 }
53+ elif extraction_config .get ("azureDiApiBase" ):
54+ llm_config = {
55+ "azureDiApiBase" : extraction_config ["azureDiApiBase" ],
56+ "azureDiEnvVarId" : extraction_config ["azureDiEnvVarId" ],
57+ }
4958 full_config = [
5059 {
5160 "task_type" : enums .CognitionMarkdownFileState .EXTRACTING .value ,
@@ -56,7 +65,7 @@ def get_full_config_and_tokenizer_from_config_id(
5665 "minio_path" : file_reference .minio_path ,
5766 "fallback" : None , # later filled by config of project
5867 },
59- ** llm_config ,
68+ "llm_config" : llm_config ,
6069 },
6170 ]
6271
@@ -68,8 +77,9 @@ def get_full_config_and_tokenizer_from_config_id(
6877 "llmIdentifier" : transformation_config .get ("llmIdentifier" ),
6978 }
7079
80+ # splitting strategy "CHUNK" needs llm_config to execute `split_large_sections_via_llm`
7181 splitting_config = {
72- "llm_config" : transformation_llm_config , # splitting strategy "CHUNK" needs llm_config to execute `split_large_sections_via_llm`
82+ "llm_config" : transformation_llm_config ,
7383 "task_type" : enums .CognitionMarkdownFileState .SPLITTING .value ,
7484 "task_config" : {
7585 "use_cache" : True ,
@@ -79,8 +89,6 @@ def get_full_config_and_tokenizer_from_config_id(
7989 }
8090
8191 if transformation_type == "COMMON_ETL" :
82- # add default splitting for common etl
83-
8492 full_config .append (splitting_config )
8593 transformers = [
8694 { # NOTE: __call_gpt_with_key only reads user_prompt
@@ -119,6 +127,7 @@ def get_full_config_and_tokenizer_from_config_id(
119127 },
120128 }
121129 )
130+
122131 if for_project :
123132 full_config .append (
124133 {
@@ -139,109 +148,24 @@ def get_full_config_and_tokenizer_from_config_id(
139148 },
140149 },
141150 )
142- else :
151+ elif for_dataset :
143152 full_config .append (
144153 {
145154 "task_type" : enums .CognitionMarkdownFileState .LOADING .value ,
146155 "task_config" : {
147156 "markdown_file" : {
148157 "enabled" : True ,
149- "id" : file_reference .meta_data ["markdown_file_id" ],
158+ "id" : (
159+ markdown_file_id
160+ or file_reference .meta_data ["markdown_file_id" ]
161+ ),
150162 }
151163 },
152164 },
153165 )
154166 return full_config , etl_preset_item .etl_config .get ("tokenizer" )
155167
156168
157- def get_full_config_for_markdown_file (
158- file_reference : FileReference ,
159- markdown_dataset : CognitionMarkdownDataset ,
160- markdown_file : CognitionMarkdownFile ,
161- chunk_size : Optional [int ] = 1000 ,
162- ) -> List [Dict [str , Any ]]:
163- extraction_llm_config , transformation_llm_config = __get_llm_config_from_dataset (
164- markdown_dataset
165- )
166- extractor = markdown_file .meta_data .get ("extractor" )
167- if extractor is None :
168- print (
169- f"WARNING: { __name__ } - no extractor found in markdown_file meta_data for { file_reference .original_file_name } , will infer default"
170- )
171-
172- full_config = [
173- {
174- "llm_config" : extraction_llm_config ,
175- "task_type" : enums .CognitionMarkdownFileState .EXTRACTING .value ,
176- "task_config" : {
177- "use_cache" : True ,
178- "extractor" : extractor ,
179- "minio_path" : file_reference .minio_path ,
180- "fallback" : None , # later filled by config of project
181- },
182- },
183- {
184- "llm_config" : extraction_llm_config ,
185- "task_type" : enums .CognitionMarkdownFileState .SPLITTING .value ,
186- "task_config" : {
187- "use_cache" : True ,
188- "strategy" : enums .ETLSplitStrategy .CHUNK .value ,
189- "chunk_size" : chunk_size ,
190- },
191- },
192- {
193- "llm_config" : transformation_llm_config ,
194- "task_type" : enums .CognitionMarkdownFileState .TRANSFORMING .value ,
195- "task_config" : {
196- "use_cache" : True ,
197- "transformers" : [
198- { # NOTE: __call_gpt_with_key only reads user_prompt
199- "enabled" : False ,
200- "name" : enums .ETLTransformer .CLEANSE .value ,
201- "system_prompt" : None ,
202- "user_prompt" : None ,
203- },
204- {
205- "enabled" : True ,
206- "name" : enums .ETLTransformer .TEXT_TO_TABLE .value ,
207- "system_prompt" : None ,
208- "user_prompt" : None ,
209- },
210- {
211- "enabled" : False ,
212- "name" : enums .ETLTransformer .SUMMARIZE .value ,
213- "system_prompt" : None ,
214- "user_prompt" : None ,
215- },
216- ],
217- },
218- },
219- {
220- "task_type" : enums .CognitionMarkdownFileState .LOADING .value ,
221- "task_config" : {
222- "markdown_file" : {
223- "enabled" : True ,
224- "id" : str (markdown_file .id ),
225- },
226- },
227- },
228- ]
229- return full_config
230-
231-
232- def __get_llm_config_from_dataset (
233- markdown_dataset : CognitionMarkdownDataset ,
234- ) -> Tuple [Dict [str , Any ], str ]:
235- extraction_llm_config = markdown_dataset .llm_config .get ("extraction" , {})
236- transformation_llm_config = markdown_dataset .llm_config .get ("transformation" , {})
237- if not extraction_llm_config or not transformation_llm_config :
238- raise ValueError (
239- f"Dataset with id { markdown_dataset .id } has incomplete llm_config"
240- )
241-
242- return extraction_llm_config , transformation_llm_config
243-
244-
245169def get_full_config_for_integration (
246170 integration : CognitionIntegration ,
247171 record : IntegrationSharepoint ,
@@ -385,6 +309,9 @@ def rm_tree(path: Path):
385309 rm_tree (etl_cache_dir )
386310
387311
312+ # TODO: delete_etl_tasks for related file_reference_id
313+
314+
def get_download_key(org_id: str, download_id: str) -> Path:
    """Return the relative key ``<org_id>/<download_id>/download`` as a Path."""
    return Path(org_id).joinpath(download_id, "download")
390317
@@ -490,10 +417,16 @@ def get_transformation_key(
490417 return transformation_key
491418
492419
def get_hashed_string(*args, delimiter: str = "_", from_bytes: bool = False) -> str:
    """Return the hexadecimal SHA-256 digest of the given arguments.

    By default every positional argument is converted with ``str``, the
    pieces are joined with ``delimiter`` and the result is encoded (UTF-8)
    before hashing.

    With ``from_bytes=True`` only the FIRST positional argument is hashed,
    converted via ``bytes(...)``; ``delimiter`` and any further positional
    arguments are ignored.

    Raises:
        ValueError: ``from_bytes`` is set but no positional argument was given.
        TypeError: ``from_bytes`` is set and the first argument is an ``int``
            or otherwise cannot be converted to ``bytes`` (e.g. a ``str``).
    """
    if from_bytes:
        if not args:
            raise ValueError("ERROR: A 'bytes' argument is required to hash")
        raw = args[0]
        # bytes(n) for an int builds n zero bytes instead of failing, which
        # would silently hash the wrong content -- reject it explicitly.
        if isinstance(raw, int):
            raise TypeError(
                "ERROR: A 'bytes' argument is required to hash, "
                f"got {type(raw).__name__}"
            )
        payload = bytes(raw)
    else:
        payload = delimiter.join(map(str, args)).encode()

    return hashlib.sha256(payload).hexdigest()
498431
499432
0 commit comments