22
33namespace dokuwiki \plugin \aichat ;
44
5+ use dokuwiki \Extension \Event ;
56use dokuwiki \Extension \PluginInterface ;
67use dokuwiki \plugin \aichat \Model \ChatInterface ;
78use dokuwiki \plugin \aichat \Model \EmbeddingInterface ;
@@ -55,17 +56,18 @@ class Embeddings
5556 * @param array $config The plugin configuration
5657 */
5758 public function __construct (
58- ChatInterface $ chatModel ,
59+ ChatInterface $ chatModel ,
5960 EmbeddingInterface $ embedModel ,
60- AbstractStorage $ storage ,
61- $ config
62- ) {
61+ AbstractStorage $ storage ,
62+ $ config
63+ )
64+ {
6365 $ this ->chatModel = $ chatModel ;
6466 $ this ->embedModel = $ embedModel ;
6567 $ this ->storage = $ storage ;
6668 $ this ->configChunkSize = $ config ['chunkSize ' ];
6769 $ this ->configContextChunks = $ config ['contextChunks ' ];
68- $ this ->similarityThreshold = $ config ['similarityThreshold ' ]/ 100 ;
70+ $ this ->similarityThreshold = $ config ['similarityThreshold ' ] / 100 ;
6971 }
7072
7173 /**
@@ -169,9 +171,10 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
169171 * @param string $page Name of the page to split
170172 * @param int $firstChunkID The ID of the first chunk of this page
171173 * @return Chunk[] A list of chunks created for this page
174+ * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
172175 * @throws \Exception
173176 */
174- protected function createPageChunks ($ page , $ firstChunkID )
177+ public function createPageChunks ($ page , $ firstChunkID )
175178 {
176179 $ chunkList = [];
177180
@@ -184,6 +187,19 @@ protected function createPageChunks($page, $firstChunkID)
184187 $ text = rawWiki ($ page );
185188 }
186189
190+ // allow plugins to modify the text before splitting
191+ $ eventData = [
192+ 'page ' => $ page ,
193+ 'body ' => '' ,
194+ 'metadata ' => ['title ' => $ page , 'relation_references ' => []],
195+ ];
196+ $ event = new Event ('INDEXER_PAGE_ADD ' , $ eventData );
197+ if ($ event ->advise_before ()) {
198+ $ text = $ eventData ['body ' ] . ' ' . $ text ;
199+ } else {
200+ $ text = $ eventData ['body ' ];
201+ }
202+
187203 $ parts = $ this ->splitIntoChunks ($ text );
188204 foreach ($ parts as $ part ) {
189205 if (trim ((string )$ part ) == '' ) continue ; // skip empty chunks
@@ -251,7 +267,7 @@ public function getSimilarChunks($query, $lang = '')
251267 foreach ($ chunks as $ chunk ) {
252268 // filter out chunks the user is not allowed to read
253269 if ($ auth && auth_quickaclcheck ($ chunk ->getPage ()) < AUTH_READ ) continue ;
254- if ($ chunk ->getScore () < $ this ->similarityThreshold ) continue ;
270+ if ($ chunk ->getScore () < $ this ->similarityThreshold ) continue ;
255271
256272 $ chunkSize = count ($ this ->getTokenEncoder ()->encode ($ chunk ->getText ()));
257273 if ($ size + $ chunkSize > $ this ->chatModel ->getMaxInputTokenLength ()) break ; // we have enough
@@ -269,7 +285,7 @@ public function getSimilarChunks($query, $lang = '')
269285 * @throws \Exception
270286 * @todo support splitting too long sentences
271287 */
272- public function splitIntoChunks ($ text )
288+ protected function splitIntoChunks ($ text )
273289 {
274290 $ sentenceSplitter = new Sentence ();
275291 $ tiktok = $ this ->getTokenEncoder ();
@@ -297,7 +313,8 @@ public function splitIntoChunks($text)
297313 $ this ->rememberSentence ($ sentence );
298314 } else {
299315 // add current chunk to result
300- $ chunks [] = $ chunk ;
316+ $ chunk = trim ($ chunk );
317+ if ($ chunk !== '' ) $ chunks [] = $ chunk ;
301318
302319 // start new chunk with remembered sentences
303320 $ chunk = implode (' ' , $ this ->sentenceQueue );
0 commit comments