Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import os\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"logging.basicConfig(filename='flood_data_extraction.log', level=logging.INFO,\n",
" format='%(asctime)s - %(levelname)s - %(message)s')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def get_csv_files(directory):\n",
" try:\n",
" return [f for f in os.listdir(directory) if f.endswith('.csv') and f.startswith('DRIMS')]\n",
" except FileNotFoundError:\n",
" logging.error(f\"Directory not found: {directory}\")\n",
" return []\n",
" except PermissionError:\n",
" logging.error(f\"Permission denied to access directory: {directory}\")\n",
" return []\n",
" except Exception as e:\n",
" logging.error(f\"Error accessing directory {directory}: {str(e)}\")\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def clean_text(text):\n",
" return re.sub(r'\\s+', ' ', text).strip()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def extract_date(filename):\n",
" match = re.search(r'(\\d{2}\\.\\d{2}\\.\\d{4})', filename)\n",
" if match:\n",
" return datetime.strptime(match.group(1), '%d.%m.%Y').strftime('%Y-%m-%d')\n",
" return None\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def parse_complex_cell(cell):\n",
" if isinstance(cell, str) and '|' in cell:\n",
" parts = re.findall(r'\\(([^|]+)\\s*\\|\\s*(\\d+)\\)', cell)\n",
" return {name.strip():int(count) for name, count in parts}\n",
" return cell"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def process_csv(csv_file):\n",
" df=pd.read_csv(csv_file, header=None, skip_blank_lines=True)\n",
" data={}\n",
" current_section=None\n",
"\n",
" for _, row in df.iterrows():\n",
" if pd.notna(row[0]) and isinstance(row[0], str) and not row[0].isdigit():\n",
" current_section=clean_text(row[0])\n",
" data[current_section]=[]\n",
" elif current_section and pd.notna(row[1]):\n",
" cleaned_row=[clean_text(str(cell)) if pd.notna(cell) else '' for cell in row]\n",
" parsed_row=[parse_complex_cell(cell) for cell in cleaned_row]\n",
" data[current_section].append(parsed_row)\n",
" return data\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def extract_bridges_damaged(data):\n",
" bridges=data.get('Infrastructure Damaged - Bridge', [])\n",
" return pd.DataFrame(bridges[1:], columns=bridges[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def extract_embankment_affected(data):\n",
" embankments = data.get('Infrastructure Damaged - Embankment Affected', [])\n",
" return pd.DataFrame(embankments[1:], columns=embankments[0])\n",
"\n",
"def extract_embankment_breached(data):\n",
" embankments = data.get('Infrastructure Damaged - Embankment Breached', [])\n",
" return pd.DataFrame(embankments[1:], columns=embankments[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def extract_roads_damaged(data):\n",
" roads = data.get('Infrastructure Damaged - Road', [])\n",
" return pd.DataFrame(roads[1:], columns=roads[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def extract_human_life_lost(data):\n",
" lives_lost = data.get('Human Lives Lost - Confirmed', [])\n",
" return pd.DataFrame(lives_lost[1:], columns=lives_lost[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def extract_population_affected(data):\n",
" population = data.get('Population And Crop Area Affected', [])\n",
" df = pd.DataFrame(population[1:], columns=population[0])\n",
" \n",
" # Handle complex cells in 'Revenue Circle' column\n",
" if 'Revenue Circle' in df.columns:\n",
" df['Revenue Circle'] = df['Revenue Circle'].apply(lambda x: x if isinstance(x, dict) else {})\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def extract_villages_affected(data):\n",
" villages = data.get('Villages Affected', [])\n",
" df = pd.DataFrame(villages[1:], columns=villages[0])\n",
" \n",
" # Handle complex cells in 'Revenue Circle' column\n",
" if 'Revenue Circle' in df.columns:\n",
" df['Revenue Circle'] = df['Revenue Circle'].apply(lambda x: x if isinstance(x, dict) else {})\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def main(csv_directory):\n",
" csv_files = get_csv_files(csv_directory)\n",
" \n",
" if not csv_files:\n",
" logging.warning(f\"No matching CSV files found in the directory: {csv_directory}\")\n",
" return\n",
"\n",
" logging.info(f\"Found {len(csv_files)} CSV files to process.\")\n",
"\n",
" for csv_file in csv_files:\n",
" full_path = os.path.join(csv_directory, csv_file)\n",
" logging.info(f\"Processing file: {full_path}\")\n",
" \n",
" date = extract_date(csv_file)\n",
" if not date:\n",
" logging.warning(f\"Could not extract date from filename: {csv_file}\")\n",
" continue\n",
"\n",
" data = process_csv(full_path)\n",
" if data is None:\n",
" logging.warning(f\"Skipping file due to processing error: {csv_file}\")\n",
" continue\n",
"\n",
" extractions = {\n",
" 'BRIDGES_DAMAGED': extract_bridges_damaged,\n",
" 'EMBANKMENT_AFFECTED': extract_embankment_affected,\n",
" 'EMBANKMENT_BREACHED': extract_embankment_breached,\n",
" 'ROADS_DAMAGED': extract_roads_damaged,\n",
" 'HUMAN_LIFE_LOST': extract_human_life_lost,\n",
" 'POPULATION_AFFECTED': extract_population_affected,\n",
" 'VILLAGES_AFFECTED': extract_villages_affected\n",
" }\n",
"\n",
" for name, extract_func in extractions.items():\n",
" try:\n",
" df = extract_func(data)\n",
" if not df.empty:\n",
" output_file = f\"{name}_{date}.csv\"\n",
" df.to_csv(output_file, index=False)\n",
" logging.info(f\"Created {output_file}\")\n",
" else:\n",
" logging.warning(f\"No data extracted for {name} from {csv_file}\")\n",
" except Exception as e:\n",
" logging.error(f\"Error extracting {name} from {csv_file}: {str(e)}\")\n",
"\n",
" # Additional data extraction\n",
" try:\n",
" relief_camps = data.get('Relief Camps / Centres Opened', [])\n",
" if relief_camps:\n",
" df_relief_camps = pd.DataFrame(relief_camps[1:], columns=relief_camps[0])\n",
" output_file = f\"RELIEF_CAMPS_{date}.csv\"\n",
" df_relief_camps.to_csv(output_file, index=False)\n",
" logging.info(f\"Created {output_file}\")\n",
" else:\n",
" logging.warning(f\"No relief camps data found in {csv_file}\")\n",
"\n",
" animals_affected = data.get('Animals Affected', [])\n",
" if animals_affected:\n",
" df_animals = pd.DataFrame(animals_affected[1:], columns=animals_affected[0])\n",
" output_file = f\"ANIMALS_AFFECTED_{date}.csv\"\n",
" df_animals.to_csv(output_file, index=False)\n",
" logging.info(f\"Created {output_file}\")\n",
" else:\n",
" logging.warning(f\"No animals affected data found in {csv_file}\")\n",
" except Exception as e:\n",
" logging.error(f\"Error processing additional data from {csv_file}: {str(e)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" csv_directory = \"/home/prajna/civicdatalab/IDS-DRR-Assam/Archive/Scrapers/FRIMS_Daily_Reports_Scraper/DRIMS_Reports_2024\"\n",
" \n",
" if not os.path.isdir(csv_directory):\n",
" logging.error(f\"The specified directory does not exist: {csv_directory}\")\n",
" print(f\"Error: The specified directory does not exist: {csv_directory}\")\n",
" else:\n",
" main(csv_directory)\n",
"\n",
" logging.info(\"Script execution completed.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ids-drr-assam",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading