|
2 | 2 |
|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
| 5 | +import itertools |
5 | 6 | import json |
6 | 7 | import logging |
| 8 | +from collections import defaultdict |
7 | 9 |
|
8 | 10 | from openai import OpenAI |
9 | 11 |
|
@@ -79,7 +81,7 @@ def analyze(self, articles: list[Article], question: str) -> str: |
79 | 81 | content = response.choices[0].message.content |
80 | 82 | if content is None: |
81 | 83 | raise AnalysisError("OpenAI returned empty response") |
82 | | - answer = content.strip() |
| 84 | + answer: str = content.strip() |
83 | 85 | logger.info("Successfully received analysis from OpenAI") |
84 | 86 | return answer |
85 | 87 | except Exception as e: |
@@ -130,3 +132,22 @@ def find_duplicate_titles(articles: list[Article]) -> list[tuple[Article, Articl |
130 | 132 | if articles[i].title == articles[j].title: |
131 | 133 | duplicates.append((articles[i], articles[j])) |
132 | 134 | return duplicates |
| 135 | + |
| 136 | + |
def find_duplicate_titles_improved(
    articles: list[Article],
) -> list[tuple[Article, Article]]:
    """Return every pair of articles that share an identical title.

    Articles are bucketed by ``title`` in a single pass over the input, and
    pairs are then generated only inside each bucket, so articles with
    different titles are never compared against each other.

    Args:
        articles: Articles to inspect. Each ``title`` is used as a dictionary
            key, so it must be hashable; titles match only when they are
            equal as dictionary keys (no case or whitespace normalization).

    Returns:
        A list of 2-tuples, one per unique pair of same-titled articles.
        Titles are processed in order of first appearance in ``articles``;
        within a title, each pair keeps the input order of its two members.
        The list is empty when no title occurs more than once (including
        for empty or single-element input).
    """
    # Group by exact title. Dicts preserve insertion order, which is what
    # makes the output order deterministic.
    # NOTE(review): the key type assumes Article.title is a str -- confirm.
    title_to_articles: defaultdict[str, list[Article]] = defaultdict(list)
    for article in articles:
        title_to_articles[article.title].append(article)

    duplicates: list[tuple[Article, Article]] = []
    for same_title_group in title_to_articles.values():
        # combinations() yields nothing for groups with fewer than two
        # members, so no explicit length check is needed.
        duplicates.extend(itertools.combinations(same_title_group, 2))

    return duplicates
0 commit comments