Export NLP community reports prompt (microsoft#1697)

natoverse · web-flow · commit fe461417b5c1 · 2025-02-12T10:41:39.000-08:00
* Properly export the NLP community reports prompt

* Add semversioner patch entry

* Fix verb tests
diff --git a/.semversioner/next-release/patch-20250212004406773499.json b/.semversioner/next-release/patch-20250212004406773499.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Export NLP community reports prompt."
+}
diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py
@@ -10,6 +10,9 @@
 from graphrag.prompts.index.community_report import (
     COMMUNITY_REPORT_PROMPT,
 )
+from graphrag.prompts.index.community_report_text_units import (
+    COMMUNITY_REPORT_TEXT_PROMPT,
+)
 from graphrag.prompts.index.extract_claims import EXTRACT_CLAIMS_PROMPT
 from graphrag.prompts.index.extract_graph import GRAPH_EXTRACTION_PROMPT
 from graphrag.prompts.index.summarize_descriptions import SUMMARIZE_PROMPT
@@ -72,7 +75,8 @@ def initialize_project_at(path: Path, force: bool) -> None:
         "extract_graph": GRAPH_EXTRACTION_PROMPT,
         "summarize_descriptions": SUMMARIZE_PROMPT,
         "extract_claims": EXTRACT_CLAIMS_PROMPT,
-        "community_report": COMMUNITY_REPORT_PROMPT,
+        "community_report_graph": COMMUNITY_REPORT_PROMPT,
+        "community_report_text": COMMUNITY_REPORT_TEXT_PROMPT,
         "drift_search_system_prompt": DRIFT_LOCAL_SYSTEM_PROMPT,
         "drift_reduce_prompt": DRIFT_REDUCE_PROMPT,
         "global_search_map_system_prompt": MAP_SYSTEM_PROMPT,
diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py
@@ -114,7 +114,8 @@
 
 community_reports:
   model_id: {defs.COMMUNITY_REPORT_MODEL_ID}
-  prompt: "prompts/community_report.txt"
+  graph_prompt: "prompts/community_report_graph.txt"
+  text_prompt: "prompts/community_report_text.txt"
   max_length: {defs.COMMUNITY_REPORT_MAX_LENGTH}
   max_input_length: {defs.COMMUNITY_REPORT_MAX_INPUT_LENGTH}
 
diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py
@@ -14,8 +14,13 @@
 class CommunityReportsConfig(BaseModel):
     """Configuration section for community reports."""
 
-    prompt: str | None = Field(
-        description="The community report extraction prompt to use.", default=None
+    graph_prompt: str | None = Field(
+        description="The community report extraction prompt to use for graph-based summarization.",
+        default=None,
+    )
+    text_prompt: str | None = Field(
+        description="The community report extraction prompt to use for text-based summarization.",
+        default=None,
     )
     max_length: int = Field(
         description="The community report maximum length in tokens.",
@@ -46,10 +51,15 @@ def resolved_strategy(
             "llm": model_config.model_dump(),
             "stagger": model_config.parallelization_stagger,
             "num_threads": model_config.parallelization_num_threads,
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text(
+            "graph_prompt": (Path(root_dir) / self.graph_prompt).read_text(
+                encoding="utf-8"
+            )
+            if self.graph_prompt
+            else None,
+            "text_prompt": (Path(root_dir) / self.text_prompt).read_text(
                 encoding="utf-8"
             )
-            if self.prompt
+            if self.text_prompt
             else None,
             "max_report_length": self.max_length,
             "max_input_length": self.max_input_length,
diff --git a/graphrag/index/flows/create_community_reports.py b/graphrag/index/flows/create_community_reports.py
@@ -46,6 +46,8 @@ async def create_community_reports(
     if claims_input is not None:
         claims = _prep_claims(claims_input)
 
+    summarization_strategy["extraction_prompt"] = summarization_strategy["graph_prompt"]
+
     max_input_length = summarization_strategy.get(
         "max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH
     )
diff --git a/graphrag/index/flows/create_community_reports_text.py b/graphrag/index/flows/create_community_reports_text.py
@@ -24,9 +24,6 @@
     build_level_context,
     build_local_context,
 )
-from graphrag.prompts.index.community_report_text_units import (
-    COMMUNITY_REPORT_PROMPT,
-)
 
 log = logging.getLogger(__name__)
 
@@ -44,8 +41,7 @@ async def create_community_reports_text(
     """All the steps to transform community reports."""
     nodes = explode_communities(communities, entities)
 
-    # TEMP: forcing override of the prompt until we can put it into config
-    summarization_strategy["extraction_prompt"] = COMMUNITY_REPORT_PROMPT
+    summarization_strategy["extraction_prompt"] = summarization_strategy["text_prompt"]
 
     max_input_length = summarization_strategy.get(
         "max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH
diff --git a/graphrag/prompts/index/community_report_text_units.py b/graphrag/prompts/index/community_report_text_units.py
@@ -3,7 +3,7 @@
 
 """A file containing prompts definition."""
 
-COMMUNITY_REPORT_PROMPT = """
+COMMUNITY_REPORT_TEXT_PROMPT = """
 You are an AI assistant that helps a human analyst to perform general information discovery.
 Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.
 
diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py
@@ -418,7 +418,8 @@ def assert_summarize_descriptions_configs(
 def assert_community_reports_configs(
     actual: CommunityReportsConfig, expected: CommunityReportsConfig
 ) -> None:
-    assert actual.prompt == expected.prompt
+    assert actual.graph_prompt == expected.graph_prompt
+    assert actual.text_prompt == expected.text_prompt
     assert actual.max_length == expected.max_length
     assert actual.max_input_length == expected.max_input_length
     assert actual.strategy == expected.strategy
diff --git a/tests/verbs/test_create_community_reports.py b/tests/verbs/test_create_community_reports.py
@@ -61,6 +61,7 @@ async def test_create_community_reports():
     config.community_reports.strategy = {
         "type": "graph_intelligence",
         "llm": llm_settings,
+        "graph_prompt": "",
     }
 
     await run_workflow(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,4 @@`
	`1`	`+ {`
	`2`	`+ "type": "patch",`
	`3`	`+ "description": "Export NLP community reports prompt."`
	`4`	`+ }`
Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,8 @@ async def create_community_reports(`
`46`	`46`	`if claims_input is not None:`
`47`	`47`	`claims = _prep_claims(claims_input)`
`48`	`48`
	`49`	`+ summarization_strategy["extraction_prompt"] = summarization_strategy["graph_prompt"]`
	`50`	`+`
`49`	`51`	`max_input_length = summarization_strategy.get(`
`50`	`52`	`"max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH`
`51`	`53`	`)`
Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,7 @@ async def test_create_community_reports():`
`61`	`61`	`config.community_reports.strategy = {`
`62`	`62`	`"type": "graph_intelligence",`
`63`	`63`	`"llm": llm_settings,`
	`64`	`+ "graph_prompt": "",`
`64`	`65`	`}`
`65`	`66`
`66`	`67`	`await run_workflow(`