Skip to content

Commit 35e7718

Browse files
committed
Match API | Added logs with CID | also added max-length on cross encoder
1 parent db324df commit 35e7718

File tree

4 files changed, with 11 additions (+11) and 7 deletions (-7).

4 files changed, with 11 additions (+11) and 7 deletions (-7).

core/common/search.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import time
33
import urllib
44

5+
from cid.locals import get_cid
56
from django.db.models import Case, When, IntegerField
67
from elasticsearch_dsl import FacetedSearch, Q
78
from pydash import compact, get
@@ -212,9 +213,10 @@ def to_queryset(self, keep_order=True, normalized_score=False, exact_count=True,
212213
encoder = bool(txt)
213214
s, hits, total = self.__get_response(exact_count, encoder)
214215
max_score = hits.max_score or 1
216+
cid = get_cid()
215217
start_time = time.time()
216218
hits = get_cross_encoder(txt, hits.hits, encoder_model) if encoder else hits.hits
217-
print(f"Cross encoder time: {time.time() - start_time}s")
219+
print(f"[{cid}] Cross encoder time: {time.time() - start_time} seconds")
218220
for result in hits:
219221
_id = get(result, '_id')
220222
rerank_score = get(result, '_rerank_score')

core/common/utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -962,7 +962,7 @@ def get_embeddings(txt):
962962

963963
def get_encoder(model):
964964
if model in ENCODERS:
965-
return CrossEncoder(model, device="cpu")
965+
return CrossEncoder(model, device="cpu", max_length=128)
966966
return settings.ENCODER
967967

968968

core/concepts/views.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import time
22

3+
from cid.locals import get_cid
34
from django.conf import settings
45
from django.db.models import F
56
from django.http import Http404
@@ -831,20 +832,21 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s
831832
reranker = self.request.GET.get('reranker', None) in get_truthy_values() # enables reranker
832833
reranker = reranker and self.request.user.is_mapper_cross_encoder_group
833834
score_to_sort = 'search_rerank_score' if reranker else 'search_normalized_score'
835+
cid = get_cid()
834836
results = []
835837
for row in rows:
836838
start_time = time.time()
837839
search = ConceptFuzzySearch.search(
838840
row, target_repo_url, repo_params, include_retired,
839841
is_semantic, num_candidates, k_nearest, map_config, faceted_criterion, locale_filter
840842
)
841-
print(f"ES Search built in {time.time() - start_time} seconds")
843+
print(f"[{cid}] ES Search built in {time.time() - start_time} seconds")
842844
start_time = time.time()
843845
search = search.params(track_total_hits=False, request_cache=True)
844846
es_search = CustomESSearch(search[start:end], ConceptDocument)
845847
name = row.get('name') or row.get('Name') if reranker else None
846848
es_search.to_queryset(False, True, False, name, encoder_model)
847-
print(f"ES Search (including reranker) executed in {time.time() - start_time} seconds")
849+
print(f"[{cid}] ES Search (including reranker) executed in {time.time() - start_time} seconds")
848850
start_time = time.time()
849851
result = {'row': row, 'results': [], 'map_config': map_config, 'filter': filters}
850852
for concept in es_search.queryset:
@@ -859,12 +861,12 @@ def filter_queryset(self, _=None): # pylint: disable=too-many-locals,too-many-s
859861
data = serializer(concept, context={'request': self.request}).data
860862
data['search_meta']['search_normalized_score'] = normalized_score * 100
861863
result['results'].append(data)
862-
print(f"Concepts serialized in {time.time() - start_time} seconds")
864+
print(f"[{cid}] Concepts serialized in {time.time() - start_time} seconds")
863865
start_time = time.time()
864866
if 'results' in result:
865867
result['results'] = sorted(
866868
result['results'], key=lambda res: get(res, f'search_meta.{score_to_sort}'), reverse=True)
867-
print(f"Concepts sorted in {time.time() - start_time} seconds")
869+
print(f"[{cid}] Concepts sorted in {time.time() - start_time} seconds")
868870
results.append(result)
869871

870872
return results

core/settings.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -619,7 +619,7 @@
619619
LM_MODEL_NAME = 'all-MiniLM-L6-v2'
620620
LM = SentenceTransformer(LM_MODEL_NAME)
621621
if ENV not in ['qa']:
622-
ENCODER = CrossEncoder("BAAI/bge-reranker-v2-m3", device="cpu")
622+
ENCODER = CrossEncoder("BAAI/bge-reranker-v2-m3", device="cpu", max_length=128)
623623

624624
ANALYTICS_API = os.environ.get('ANALYTICS_API', 'http://host.docker.internal:8002')
625625
if ANALYTICS_API:

0 commit comments

Comments
 (0)