Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/372.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refactored OpenAI parser to be used as a fallback and fixed subject injection to skip text/calendar parts.
24 changes: 17 additions & 7 deletions circuit_maintenance_parser/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,6 @@ def get_maintenances(self, data: NotificationData) -> Iterable[Maintenance]:
logger.debug("Skipping notification %s due filtering policy for %s.", data, self.__class__.__name__)
return []

if os.getenv("PARSER_OPENAI_API_KEY"):
self._processors.append(CombinedProcessor(data_parsers=[EmailDateParser, OpenAIParser]))

# Add subject to all html or text/* data_parts if not already present.
self.add_subject_to_text(data)

for processor in self._processors:
try:
return processor.process(data, self.get_extended_data())
Expand All @@ -150,6 +144,22 @@ def get_maintenances(self, data: NotificationData) -> Iterable[Maintenance]:
related_exceptions.append(exc)
continue

# Use OpenAI parser as a last resort if all other processors failed.
if os.getenv("PARSER_OPENAI_API_KEY"):
self.add_subject_to_text(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still feels like modifying data in place may lead to bugs in the future. Might be better to change add_subject_to_text(data) so that it returns a modified copy instead of modifying the original in place?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should be good here, this is the last point in the path. Everything should be sent in and should be kept outside of the parsers.

openai_processor = CombinedProcessor(data_parsers=[EmailDateParser, OpenAIParser])
try:
return openai_processor.process(data, self.get_extended_data())
except ProcessorError as exc:
process_error_message = (
f"- Processor {openai_processor.__class__.__name__} from {provider_name} failed due to: %s\n"
)
logger.debug(process_error_message, traceback.format_exc())

related_exc = rgetattr(exc, "__cause__")
error_message += process_error_message % related_exc
related_exceptions.append(exc)

raise ProviderError(
(f"Failed creating Maintenance notification for {provider_name}.\nDetails:\n{error_message}"),
related_exceptions=related_exceptions,
Expand All @@ -165,7 +175,7 @@ def add_subject_to_text(self, data: NotificationData):
if subject:
new_data_parts = []
for part in data.data_parts:
if part.type.startswith("text/") or part.type.startswith("html"):
if (part.type.startswith("text/") or part.type.startswith("html")) and part.type != "text/calendar":
content_str = part.content.decode(errors="ignore")
if subject not in content_str:
# Append subject and update content
Expand Down
99 changes: 89 additions & 10 deletions tests/unit/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@

from circuit_maintenance_parser.data import NotificationData
from circuit_maintenance_parser.errors import ProcessorError, ProviderError
from circuit_maintenance_parser.parser import EmailDateParser, Parser
from circuit_maintenance_parser.parsers.openai import OpenAIParser
from circuit_maintenance_parser.processor import CombinedProcessor, SimpleProcessor
from circuit_maintenance_parser.parser import Parser
from circuit_maintenance_parser.processor import SimpleProcessor
from circuit_maintenance_parser.provider import AquaComms, GenericProvider

# pylint: disable=use-implicit-booleaness-not-comparison
Expand Down Expand Up @@ -116,21 +115,77 @@ class ProviderWithIncludeFilter(GenericProvider):
"provider_class",
[GenericProvider, AquaComms],
)
def test_provider_gets_mlparser(provider_class):
"""Test to check the any provider gets a default ML parser when ENV is activated."""
def test_provider_falls_back_to_openai(provider_class):
"""Test that OpenAI parser is used as last resort when all other processors fail."""
with patch.dict(os.environ, {"PARSER_OPENAI_API_KEY": "some_api_key"}):
data = NotificationData.init_from_raw("text/plain", b"fake data")
data.add_data_part("text/html", b"other data")

provider = provider_class()
original_processor_count = len(provider._processors) # pylint: disable=protected-access

with patch("circuit_maintenance_parser.processor.GenericProcessor.process") as mock_processor:
# All native processors fail, then OpenAI processor succeeds
mock_processor.side_effect = [ProcessorError] * original_processor_count + [[{"a": "b"}]]
provider.get_maintenances(data)
# Native processors + 1 OpenAI call
assert mock_processor.call_count == original_processor_count + 1

# OpenAI processor should NOT be appended to the provider's processor list
assert len(provider._processors) == original_processor_count # pylint: disable=protected-access


def test_provider_does_not_use_openai_when_native_succeeds():
"""Test that OpenAI parser is not invoked when a native processor succeeds."""
os.environ["PARSER_OPENAI_API_KEY"] = "some_api_key"
data = NotificationData.init_from_raw("text/plain", b"fake data")
data.add_data_part("text/html", b"other data")

provider = provider_class()
provider = GenericProvider()

with patch("circuit_maintenance_parser.processor.GenericProcessor.process") as mock_processor:
mock_processor.return_value = [{"a": "b"}]
provider.get_maintenances(data)
# Only the native processor should be called
assert mock_processor.call_count == 1

assert provider._processors[-1] == CombinedProcessor( # pylint: disable=protected-access
data_parsers=[EmailDateParser, OpenAIParser]
)
os.environ.pop("PARSER_OPENAI_API_KEY", None)


def test_provider_data_not_mutated_when_native_succeeds():
"""Test that add_subject_to_text is not called when native processors succeed."""
os.environ["PARSER_OPENAI_API_KEY"] = "some_api_key"
data = NotificationData.init_from_raw("text/plain", b"fake data")
data.add_data_part("email-header-subject", b"Test Subject")

original_content = data.data_parts[0].content
provider = GenericProvider()

with patch("circuit_maintenance_parser.processor.GenericProcessor.process") as mock_processor:
mock_processor.return_value = [{"a": "b"}]
provider.get_maintenances(data)

# Data should not have been mutated since native processor succeeded
assert data.data_parts[0].content == original_content
os.environ.pop("PARSER_OPENAI_API_KEY", None)


def test_provider_no_repeated_append_on_multiple_calls():
"""Test that calling get_maintenances multiple times doesn't grow the processor list."""
os.environ["PARSER_OPENAI_API_KEY"] = "some_api_key"
data = NotificationData.init_from_raw("text/plain", b"fake data")

provider = GenericProvider()
original_processor_count = len(provider._processors) # pylint: disable=protected-access

with patch("circuit_maintenance_parser.processor.GenericProcessor.process") as mock_processor:
mock_processor.return_value = [{"a": "b"}]
provider.get_maintenances(data)
provider.get_maintenances(data)
provider.get_maintenances(data)

# Processor list should never grow
assert len(provider._processors) == original_processor_count # pylint: disable=protected-access
os.environ.pop("PARSER_OPENAI_API_KEY", None)


def test_add_subject_to_text_appends_subject_to_text_parts():
Expand Down Expand Up @@ -230,3 +285,27 @@ def test_add_subject_to_text_handles_decode_errors():

# This should not raise an exception due to errors="ignore" in decode()
provider.add_subject_to_text(data)


def test_add_subject_to_text_skips_text_calendar():
"""Test that add_subject_to_text does not modify text/calendar parts."""
provider = GenericProvider()

data = NotificationData()
data.add_data_part("email-header-subject", b"Planned Work Notification")
data.add_data_part(
"text/calendar",
b"BEGIN:VCALENDAR\r\nVERSION:2.0\r\nBEGIN:VEVENT\r\nEND:VEVENT\r\nEND:VCALENDAR",
)
data.add_data_part("text/plain", b"Some plain text content")

original_calendar_content = data.data_parts[1].content

provider.add_subject_to_text(data)

# text/calendar part should remain unchanged
assert data.data_parts[1].content == original_calendar_content
assert b"Planned Work Notification" not in data.data_parts[1].content

# text/plain part should have subject appended
assert b"Planned Work Notification" in data.data_parts[2].content