Skip to content

Commit ab1f8dd

Browse files
committed
emit the INDEXER_PAGE_ADD event
This allows plugins that add data to the fulltext index to add the same data to the embeddings. For example, this improves embedding searches over struct data.
1 parent 720bb43 commit ab1f8dd

File tree

4 files changed

+39
-17
lines changed

4 files changed

+39
-17
lines changed

Embeddings.php

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace dokuwiki\plugin\aichat;
44

5+
use dokuwiki\Extension\Event;
56
use dokuwiki\Extension\PluginInterface;
67
use dokuwiki\plugin\aichat\Model\ChatInterface;
78
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
@@ -55,17 +56,18 @@ class Embeddings
5556
* @param array $config The plugin configuration
5657
*/
5758
public function __construct(
58-
ChatInterface $chatModel,
59+
ChatInterface $chatModel,
5960
EmbeddingInterface $embedModel,
60-
AbstractStorage $storage,
61-
$config
62-
) {
61+
AbstractStorage $storage,
62+
$config
63+
)
64+
{
6365
$this->chatModel = $chatModel;
6466
$this->embedModel = $embedModel;
6567
$this->storage = $storage;
6668
$this->configChunkSize = $config['chunkSize'];
6769
$this->configContextChunks = $config['contextChunks'];
68-
$this->similarityThreshold = $config['similarityThreshold']/100;
70+
$this->similarityThreshold = $config['similarityThreshold'] / 100;
6971
}
7072

7173
/**
@@ -169,9 +171,10 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
169171
* @param string $page Name of the page to split
170172
* @param int $firstChunkID The ID of the first chunk of this page
171173
* @return Chunk[] A list of chunks created for this page
174+
* @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
172175
* @throws \Exception
173176
*/
174-
protected function createPageChunks($page, $firstChunkID)
177+
public function createPageChunks($page, $firstChunkID)
175178
{
176179
$chunkList = [];
177180

@@ -184,6 +187,19 @@ protected function createPageChunks($page, $firstChunkID)
184187
$text = rawWiki($page);
185188
}
186189

190+
// allow plugins to modify the text before splitting
191+
$eventData = [
192+
'page' => $page,
193+
'body' => '',
194+
'metadata' => ['title' => $page, 'relation_references' => []],
195+
];
196+
$event = new Event('INDEXER_PAGE_ADD', $eventData);
197+
if ($event->advise_before()) {
198+
$text = $eventData['body'] . ' ' . $text;
199+
} else {
200+
$text = $eventData['body'];
201+
}
202+
187203
$parts = $this->splitIntoChunks($text);
188204
foreach ($parts as $part) {
189205
if (trim((string)$part) == '') continue; // skip empty chunks
@@ -251,7 +267,7 @@ public function getSimilarChunks($query, $lang = '')
251267
foreach ($chunks as $chunk) {
252268
// filter out chunks the user is not allowed to read
253269
if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
254-
if($chunk->getScore() < $this->similarityThreshold) continue;
270+
if ($chunk->getScore() < $this->similarityThreshold) continue;
255271

256272
$chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
257273
if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
@@ -269,7 +285,7 @@ public function getSimilarChunks($query, $lang = '')
269285
* @throws \Exception
270286
* @todo support splitting too long sentences
271287
*/
272-
public function splitIntoChunks($text)
288+
protected function splitIntoChunks($text)
273289
{
274290
$sentenceSplitter = new Sentence();
275291
$tiktok = $this->getTokenEncoder();
@@ -297,7 +313,8 @@ public function splitIntoChunks($text)
297313
$this->rememberSentence($sentence);
298314
} else {
299315
// add current chunk to result
300-
$chunks[] = $chunk;
316+
$chunk = trim($chunk);
317+
if ($chunk !== '') $chunks[] = $chunk;
301318

302319
// start new chunk with remembered sentences
303320
$chunk = implode(' ', $this->sentenceQueue);

Storage/SQLiteStorage.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ public function startCreation($clear = false)
6565
if ($clear) {
6666
/** @noinspection SqlWithoutWhere */
6767
$this->db->exec('DELETE FROM embeddings');
68+
/** @noinspection SqlWithoutWhere */
69+
$this->db->exec('DELETE FROM clusters');
6870
}
6971
}
7072

cli.php

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,10 +210,9 @@ protected function page($page, $dump = false)
210210
*/
211211
protected function split($page)
212212
{
213-
$text = rawWiki($page);
214-
$chunks = $this->helper->getEmbeddings()->splitIntoChunks($text);
213+
$chunks = $this->helper->getEmbeddings()->createPageChunks($page, 0);
215214
foreach ($chunks as $chunk) {
216-
echo $chunk;
215+
echo $chunk->getText();
217216
echo "\n";
218217
$this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
219218
}

cli/simulate.php

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
<?php
22

3-
use dokuwiki\Extension\CLIPlugin;
43
use dokuwiki\plugin\aichat\AbstractCLI;
5-
use dokuwiki\plugin\aichat\ModelFactory;
64
use splitbrain\phpcli\Colors;
75
use splitbrain\phpcli\Options;
86

@@ -76,9 +74,15 @@ protected function simulate($questions, $model)
7674
$this->helper->getEmbeddingModel()->resetUsageStats();
7775

7876
$this->colors->ptln($q, Colors::C_LIGHTPURPLE);
79-
$result = $this->helper->askChatQuestion($q, $history);
80-
$history[] = [$result['question'], $result['answer']];
81-
$this->colors->ptln($result['question'], Colors::C_LIGHTBLUE);
77+
try {
78+
$result = $this->helper->askChatQuestion($q, $history);
79+
$history[] = [$result['question'], $result['answer']];
80+
$this->colors->ptln($result['question'], Colors::C_LIGHTBLUE);
81+
} catch (Exception $e) {
82+
$this->error($e->getMessage());
83+
$this->debug($e->getTraceAsString());
84+
$result = ['question' => $q, 'answer' => "ERROR\n" . $e->getMessage(), 'sources' => []];
85+
}
8286

8387
$record = [
8488
'question' => $q,

0 commit comments

Comments
 (0)