diff --git a/.gitignore b/.gitignore index 1e629c60..73f59117 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ !demographics.csv !incomes_projection.csv !policyengine_uk_data/datasets/local_areas/**/*.csv +!policyengine_uk_data/datasets/firm/**/*.csv **/_build !policyengine_uk_data/storage/*.csv **/version.json diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..ce154ba5 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,3 @@ +- bump: minor + changes: + - Firm synthetic data generation script with ONS and HMRC data integration \ No newline at end of file diff --git a/policyengine_uk_data/datasets/firm.py b/policyengine_uk_data/datasets/firm.py new file mode 100644 index 00000000..2322066e --- /dev/null +++ b/policyengine_uk_data/datasets/firm.py @@ -0,0 +1,234 @@ +""" +Firm dataset for PolicyEngine UK. + +This module processes synthetic firm data into PolicyEngine UK dataset format, +handling firm demographics, turnover, VAT, employment, and other business variables. +The synthetic firm data represents the UK business population for tax-benefit modelling. +""" + +from policyengine_core.data import Dataset +from pathlib import Path +import pandas as pd +import numpy as np +from policyengine_uk_data.utils.datasets import STORAGE_FOLDER +import logging + +logger = logging.getLogger(__name__) + + +def create_firm(year: int = 2023): + """ + Process synthetic firm data into PolicyEngine UK dataset format. + + Generates synthetic firm microdata and transforms it into a structured + PolicyEngine UK dataset with firm, sector, and employment-level variables + mapped to the appropriate tax-benefit system variables. + + Args: + year: Survey year for the dataset. + + Returns: + Dataset with processed firm data ready for policy simulation. 
+ """ + # Always generate fresh synthetic data using generate_synthetic_data.py + logger.info("Generating synthetic firm data...") + import sys + + sys.path.append(str(Path(__file__).parent / "firm")) + from generate_synthetic_data import SyntheticFirmGenerator + + generator = SyntheticFirmGenerator(device="cpu") + synthetic_df = generator.generate_synthetic_firms() + + # Create entity DataFrames for firm structure + pe_firm = pd.DataFrame() + pe_sector = pd.DataFrame() + pe_business_group = pd.DataFrame() + + # Add primary keys and identifiers + pe_firm["firm_id"] = range(len(synthetic_df)) + pe_firm["firm_sector_id"] = synthetic_df["sic_code"].astype(int) + pe_firm["firm_business_group_id"] = pe_firm["firm_id"] // 100 + + # Create unique sectors + unique_sectors = synthetic_df["sic_code"].astype(int).unique() + pe_sector["sector_id"] = unique_sectors + + # Create business groups + unique_groups = pe_firm["firm_business_group_id"].unique() + pe_business_group["business_group_id"] = unique_groups + + # Add grossing weights + pe_firm["firm_weight"] = synthetic_df["weight"].values + + # Add basic firm variables - exactly from synthetic data + pe_firm["sic_code"] = synthetic_df["sic_code"] + pe_firm["annual_turnover_k"] = synthetic_df["annual_turnover_k"].values + pe_firm["annual_input_k"] = synthetic_df["annual_input_k"].values + pe_firm["vat_liability_k"] = synthetic_df["vat_liability_k"].values + pe_firm["employment"] = synthetic_df["employment"].astype(int).values + pe_firm["vat_registered"] = ( + synthetic_df["vat_registered"].astype(bool).values + ) + + # Add derived variables using pd.cut for efficiency + turnover_bins = [0, 85, 150, 300, 500, 1000, 10000, float("inf")] + turnover_labels = [ + "£1_to_Threshold", + "£Threshold_to_£150k", + "£150k_to_£300k", + "£300k_to_£500k", + "£500k_to_£1m", + "£1m_to_£10m", + "Greater_than_£10m", + ] + pe_firm["hmrc_band"] = ( + pd.cut( + pe_firm["annual_turnover_k"], + bins=turnover_bins, + labels=turnover_labels, + 
include_lowest=True, + ) + .astype(str) + .replace("nan", "Negative_or_Zero") + ) + + employment_bins = [0, 4, 9, 19, 49, 99, 249, float("inf")] + employment_labels = [ + "0-4", + "5-9", + "10-19", + "20-49", + "50-99", + "100-249", + "250+", + ] + pe_firm["employment_band"] = pd.cut( + pe_firm["employment"], + bins=employment_bins, + labels=employment_labels, + include_lowest=True, + ).astype(str) + + pe_firm["sic_numeric"] = pe_firm["sic_code"].astype(int) + + # Add year field + pe_firm["year"] = year + pe_sector["year"] = year + pe_business_group["year"] = year + + # Create the dataset - use a simple object to hold the data + class FirmDataset: + def __init__(self): + self.firm = pe_firm + self.sector = pe_sector + self.business_group = pe_business_group + + def save(self, path): + # Save as HDF5 for compatibility + self.firm.to_hdf(path, key="firm", mode="w") + self.sector.to_hdf(path, key="sector", mode="a") + self.business_group.to_hdf(path, key="business_group", mode="a") + + dataset = FirmDataset() + + # Add metadata about the dataset + dataset.metadata = { + "source": "synthetic_firm_generator", + "year": year, + "n_firms": len(pe_firm), + "total_weighted_firms": pe_firm["firm_weight"].sum(), + "vat_registered_firms": pe_firm[pe_firm["vat_registered"]][ + "firm_weight" + ].sum(), + "total_employment": ( + pe_firm["employment"] * pe_firm["firm_weight"] + ).sum(), + "total_turnover_billions": ( + pe_firm["annual_turnover_k"] * pe_firm["firm_weight"] + ).sum() + / 1e6, + "total_vat_liability_billions": ( + pe_firm["vat_liability_k"] * pe_firm["firm_weight"] + ).sum() + / 1e6, + } + + logger.info(f"Created firm dataset with {len(pe_firm):,} firms") + logger.info( + f"Total weighted population: {dataset.metadata['total_weighted_firms']:,.0f}" + ) + logger.info( + f"Total employment: {dataset.metadata['total_employment']:,.0f}" + ) + logger.info( + f"Total turnover: £{dataset.metadata['total_turnover_billions']:.1f}bn" + ) + + return dataset + + +# Dataset class 
def _firm_2023_24_path():
    """Return the location of ``firm_2023_24.h5`` inside the storage folder."""
    # Storage location is resolved at call time.
    from policyengine_uk_data.utils.datasets import STORAGE_FOLDER

    return STORAGE_FOLDER / "firm_2023_24.h5"


# Dataset class for direct import like FRS
class firm_2023_24:
    """UK Firm dataset for 2023-24, following the FRS pattern.

    On construction the ``firm``, ``sector`` and ``business_group`` tables are
    read from ``firm_2023_24.h5`` in the storage folder when that file exists.
    Otherwise they are built with ``create_firm(year=2023)`` and saved to that
    file before being exposed.
    """

    def __init__(self):
        dataset_path = _firm_2023_24_path()

        if not dataset_path.exists():
            # No stored copy yet: create the dataset and save it for reuse.
            dataset = create_firm(year=2023)
            dataset.save(dataset_path)
            self.firm = dataset.firm
            self.sector = dataset.sector
            self.business_group = dataset.business_group
            return

        self.firm = pd.read_hdf(dataset_path, key="firm")
        self.sector = pd.read_hdf(dataset_path, key="sector")
        self.business_group = pd.read_hdf(dataset_path, key="business_group")


# Metadata keys that are printed as whole numbers with thousands separators.
_COUNT_KEYS = (
    "n_firms",
    "total_weighted_firms",
    "vat_registered_firms",
    "total_employment",
)


def _format_metadata_line(key, value):
    """Render one ``key: value`` summary line.

    Real numbers get a ``£…bn`` format when the key contains ``billions`` and
    a thousands-separated whole number for the count keys; everything else is
    printed as-is.
    """
    import numbers

    # numbers.Real also matches NumPy integer scalars, which are not
    # instances of the built-in int and would otherwise skip the formatting.
    if isinstance(value, numbers.Real):
        if "billions" in key:
            return f"{key}: £{value:.2f}bn"
        if key in _COUNT_KEYS:
            return f"{key}: {value:,.0f}"
    return f"{key}: {value}"


def main():
    """Test the firm dataset creation."""
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    logger.info("Creating firm dataset...")

    # Create the dataset
    firm_dataset = create_firm(year=2023)

    # Save to storage
    output_path = _firm_2023_24_path()
    firm_dataset.save(output_path)

    logger.info(f"Saved firm dataset to {output_path}")

    # Display summary statistics
    print("\n" + "=" * 60)
    print("FIRM DATASET SUMMARY")
    print("=" * 60)

    for key, value in firm_dataset.metadata.items():
        print(_format_metadata_line(key, value))

    print("=" * 60)
    print("Dataset creation complete!")


# Main execution for testing
if __name__ == "__main__":
    main()
b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/raw_data/Annual_UK_VAT_Statistics_2023-24.xls differ diff --git a/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_sector.csv b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_sector.csv new file mode 100644 index 00000000..9d9aea39 --- /dev/null +++ b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_sector.csv @@ -0,0 +1,89 @@ +Trade_Sector,Trade_Sub_Sector,2023-24 +00001,"Crop and animal production, hunting and related service activities",-2330 +00002,Forestry and logging,100 +00003,Fishing and aquaculture,-100 +00005,Mining of coal and lignite,10 +00006,Extraction of crude petroleum and natural gas,-460 +00007,Mining of metal ores,-10 +00008,Other mining and quarrying,270 +00009,Mining and support service activities,-330 +00010,Manufacture of food products,-2150 +00011,Manufacture of beverages,1820 +00012,Manufacture of tobacco products,1700 +00013,Manufacture of textiles,450 +00014,Manufacture of wearing apparel,280 +00015,Manufacture of leather and related products,140 +00016,"Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials",650 +00017,Manufacture of paper and paper products,920 +00018,Printing and reproduction of recorded media,480 +00019,Manufacture of coke and refined petroleum products,7810 +00020,Manufacture of chemicals and chemical products,1500 +00021,Manufacture of basic pharmaceutical products and pharmaceutical preparations,570 +00022,Manufacture of rubber and rubber products,1440 +00023,Manufacture of other non-metallic mineral products,1130 +00024,Manufacture of basic metals,-830 +00025,"Manufacture of fabricated metal products, except machinery and equipment",2000 +00026,"Manufacture of computer, electronic and optical products",1050 +00027,Manufacture of electrical equipment,770 +00028,Manufacture of machinery and 
equipment n.e.c,1150 +00029,"Manufacture of motor vehicles, trailers and semi-trailers",2040 +00030,Manufacture of other transport equipment,-1240 +00031,Manufacture of furniture,730 +00032,Other manufacturing,1090 +00033,Repair and installation of machinery and equipment,810 +00035,"Electricity, gas, steam and air conditioning supply",6020 +00036,"Water collection, treatment and supply",-2250 +00037,Sewerage,-10 +00038,"Waste collection, treatment and disposal activities; materials recovery",680 +00039,Remediation activities and other waste management services,370 +00041,Construction of buildings,2780 +00042,Civil engineering,4130 +00043,Specialised construction activities,3950 +00045,Wholesale and retail trade and repair of motor vehicles and motorcycles,13040 +00046,"Wholesale trade, except of motor vehicles and motorcycles",25980 +00047,Retail trade except of motor vehicles and motorcycles,16440 +00049,Land transport and transport via pipelines,-420 +00050,Water transport,-100 +00051,Air transport,-1080 +00052,Warehousing and support activities for transportation,1930 +00053,Postal and courier activities,1140 +00055,Accommodation,2760 +00056,Food and beverage service activities,8450 +00058,Publishing activities,700 +00059,"Motion picture, video and television programme production, sound recording and music publishing activities",210 +00060,Programming and broadcasting activities,-20 +00061,Telecommunications,4320 +00062,"Computer programming, consultancy and related activities",13720 +00063,Information service activities,460 +00064,"Financial service activities, except insurance and pension funding",-190 +00065,"Insurance, reinsurance and pension funding, except compulsory social security",200 +00066,Activities auxiliary to financial services and insurance activities,1180 +00068,Real estate activities,4180 +00069,Legal and accounting services,9120 +00070,Activities of head offices; management consultancy services,9010 +00071,Architectural and engineering 
activities; technical testing and analysis,4830 +00072,Scientific research and development,160 +00073,Advertising and market research,1990 +00074,"Other professional, scientific and technical activities",1560 +00075,Veterinary activities,770 +00077,Rental and leasing activities,3410 +00078,Employment activities,6000 +00079,"Travel agency, tour operator and other reservation service and related activities",-30 +00080,Security and investigation activities,1050 +00081,Services to buildings and landscape activities,2150 +00082,Office administrative and support activities,6230 +00084,Public administration and defence; compulsory social security,-11080 +00085,Education,410 +00086,Human health activities,60 +00087,Residential care activities,260 +00088,Social work activities without accommodation,370 +00090,"Creative, arts and entertainment activities",1690 +00091,"Libraries, archives, museums and other cultural activities",40 +00092,Gambling and betting activities,50 +00093,Sports activities and amusement and recreation activities,1680 +00094,Activities of membership organisations,520 +00095,Repair of computers and personal and household goods,280 +00096,Other personal service activities,2780 +00097,Activities of households as employers of domestic personnel,0 +00098,Undifferentiated goods- and services-producing activities of private households for own use,0 +00099,Activities of extraterritorial organisations and bodies,40 diff --git a/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_turnover_band.csv b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_turnover_band.csv new file mode 100644 index 00000000..e8dd09f9 --- /dev/null +++ b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_liability_by_turnover_band.csv @@ -0,0 +1,21 @@ +Financial_Year,Negative_or_Zero,£1_to_Threshold,£Threshold_to_£150k,£150k_to_£300k,£300k_to_£500k,£500k_to_£1m,£1m_to_£10m,Greater_than_£10m,Total 
+2004-05,-980,150,2660,3020,2650,4100,14080,35270,60940 +2005-06,-1850,200,2630,2980,2640,3880,13060,32450,55980 +2006-07,-1490,170,2780,3150,2810,4250,14750,36330,62760 +2007-08,-1700,90,2680,3130,2720,4410,15900,40920,68140 +2008-09,-1540,120,2750,3060,2640,4150,15030,37590,63810 +2009-10,-960,470,2490,2770,2340,3550,14200,38780,63640 +2010-11,-1010,570,2880,3220,2630,4120,15750,41020,69200 +2011-12,-1300,680,3170,3760,2940,4630,17180,47370,78430 +2012-13,-1270,820,3190,3810,2980,4710,17700,50140,82070 +2013-14,-1490,800,3190,4000,3150,4800,19020,54700,88170 +2014-15,-1770,800,3300,4250,3220,5090,20210,57960,93060 +2015-16,-2230,720,3420,4370,3400,5300,21060,61120,97160 +2016-17,-2200,750,3510,4700,3460,5470,21460,62960,100110 +2017-18,-2250,770,3740,4830,3650,5640,22740,63820,102930 +2018-19,-2270,820,3760,4980,3650,5690,23360,65300,105300 +2019-20,-2270,770,3730,5170,3790,5870,23820,68270,109130 +2020-21,-2220,440,2530,3540,2770,4550,19620,67000,98230 +2021-22,-2500,1260,2290,3600,3060,5040,22720,105180,140650 +2022-23,-3050,-10,2300,4380,3690,5880,26340,117230,156740 +2023-24,-2640,1460,2430,4590,3900,6170,27520,129960,173380 diff --git a/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_sector.csv b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_sector.csv new file mode 100644 index 00000000..1f94fc1c --- /dev/null +++ b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_sector.csv @@ -0,0 +1,90 @@ +Trade_Sector,Trade_Sub_Sector,2023-24 +00001,"Crop and animal production, hunting and related service activities",117930 +00002,Forestry and logging,5150 +00003,Fishing and aquaculture,3790 +00005,Mining of coal and lignite,30 +00006,Extraction of crude petroleum and natural gas,230 +00007,Mining of metal ores,80 +00008,Other mining and quarrying,780 +00009,Mining and support service activities,1470 +00010,Manufacture of food products,8840 +00011,Manufacture of 
beverages,3190 +00012,Manufacture of tobacco products,70 +00013,Manufacture of textiles,3940 +00014,Manufacture of wearing apparel,2970 +00015,Manufacture of leather and related products,710 +00016,"Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials",8010 +00017,Manufacture of paper and paper products,1570 +00018,Printing and reproduction of recorded media,9560 +00019,Manufacture of coke and refined petroleum products,180 +00020,Manufacture of chemicals and chemical products,3430 +00021,Manufacture of basic pharmaceutical products and pharmaceutical preparations,790 +00022,Manufacture of rubber and rubber products,4590 +00023,Manufacture of other non-metallic mineral products,4060 +00024,Manufacture of basic metals,2320 +00025,"Manufacture of fabricated metal products, except machinery and equipment",19170 +00026,"Manufacture of computer, electronic and optical products",5630 +00027,Manufacture of electrical equipment,2690 +00028,Manufacture of machinery and equipment n.e.c,7440 +00029,"Manufacture of motor vehicles, trailers and semi-trailers",2970 +00030,Manufacture of other transport equipment,1970 +00031,Manufacture of furniture,5590 +00032,Other manufacturing,9760 +00033,Repair and installation of machinery and equipment,12640 +00035,"Electricity, gas, steam and air conditioning supply",7120 +00036,"Water collection, treatment and supply",630 +00037,Sewerage,840 +00038,"Waste collection, treatment and disposal activities; materials recovery",4970 +00039,Remediation activities and other waste management services,1180 +00041,Construction of buildings,108430 +00042,Civil engineering,20940 +00043,Specialised construction activities,195830 +00045,Wholesale and retail trade and repair of motor vehicles and motorcycles,67550 +00046,"Wholesale trade, except of motor vehicles and motorcycles",98080 +00047,Retail trade except of motor vehicles and motorcycles,290780 +00049,Land transport and 
transport via pipelines,51830 +00050,Water transport,1370 +00051,Air transport,1070 +00052,Warehousing and support activities for transportation,11610 +00053,Postal and courier activities,28550 +00055,Accommodation,17460 +00056,Food and beverage service activities,106570 +00058,Publishing activities,10980 +00059,"Motion picture, video and television programme production, sound recording and music publishing activities",29320 +00060,Programming and broadcasting activities,1410 +00061,Telecommunications,6750 +00062,"Computer programming, consultancy and related activities",116650 +00063,Information service activities,6640 +00064,"Financial service activities, except insurance and pension funding",12770 +00065,"Insurance, reinsurance and pension funding, except compulsory social security",2900 +00066,Activities auxiliary to financial services and insurance activities,6140 +00068,Real estate activities,107180 +00069,Legal and accounting services,56760 +00070,Activities of head offices; management consultancy services,131400 +00071,Architectural and engineering activities; technical testing and analysis,62270 +00072,Scientific research and development,5650 +00073,Advertising and market research,18930 +00074,"Other professional, scientific and technical activities",45160 +00075,Veterinary activities,3470 +00077,Rental and leasing activities,17530 +00078,Employment activities,24470 +00079,"Travel agency, tour operator and other reservation service and related activities",6550 +00080,Security and investigation activities,8090 +00081,Services to buildings and landscape activities,35200 +00082,Office administrative and support activities,47640 +00084,Public administration and defence; compulsory social security,3400 +00085,Education,20920 +00086,Human health activities,9160 +00087,Residential care activities,1550 +00088,Social work activities without accommodation,2990 +00090,"Creative, arts and entertainment activities",27010 +00091,"Libraries, archives, museums and other 
cultural activities",1080 +00092,Gambling and betting activities,370 +00093,Sports activities and amusement and recreation activities,30820 +00094,Activities of membership organisations,8320 +00095,Repair of computers and personal and household goods,4670 +00096,Other personal service activities,37910 +00097,Activities of households as employers of domestic personnel,130 +00098,Undifferentiated goods- and services-producing activities of private households for own use,140 +00099,Activities of extraterritorial organisations and bodies,60 +Total,,2178950 diff --git a/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_turnover_band.csv b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_turnover_band.csv new file mode 100644 index 00000000..c25922db --- /dev/null +++ b/policyengine_uk_data/datasets/firm/HMRC_VAT_annual_statistics/vat_population_by_turnover_band.csv @@ -0,0 +1,21 @@ +Financial_Year,Negative_or_Zero,£1_to_Threshold,£Threshold_to_£150k,£150k_to_£300k,£300k_to_£500k,£500k_to_£1m,£1m_to_£10m,Greater_than_£10m,Unknown,Total +2004-05,155820,559510,451390,257420,134860,121810,142720,23500,0,1847030 +2005-06,191450,602910,440700,254460,132240,118110,139290,23710,40050,1942920 +2006-07,269480,578240,446800,267030,141200,127650,154820,26160,0,2011380 +2007-08,270060,617920,433270,268310,142380,131830,160530,26780,0,2051080 +2008-09,263770,618080,417410,270400,141260,131860,158740,26830,42340,2070690 +2009-10,269450,625610,393750,258250,135420,124520,148210,25190,39800,2020180 +2010-11,233300,627540,387630,263380,137040,126160,151690,26350,44080,1997160 +2011-12,196770,641310,378070,270950,139110,129300,156260,27560,56110,1995440 +2012-13,197580,666670,358250,272340,138120,128940,156880,27800,54090,2000650 +2013-14,219850,680150,353640,280500,141150,132160,163040,29270,61470,2061220 +2014-15,226320,702100,352280,290160,144850,137210,170760,30810,73580,2128080 
+2015-16,246320,740080,359750,297390,147780,139440,175940,31490,79670,2217840 +2016-17,265640,777330,362070,307320,151550,142730,181640,33170,87140,2308580 +2017-18,292510,752520,342910,306610,154870,146150,187480,34720,113980,2331730 +2018-19,310900,741550,346830,314620,157810,149410,191820,36130,103180,2352250 +2019-20,413970,732800,345250,318210,158960,150540,194580,36830,103750,2454910 +2020-21,462810,834440,297100,281300,147050,141160,180320,34100,103050,2481330 +2021-22,449120,773120,294410,312460,172520,166910,214490,40230,131220,2554470 +2022-23,402860,705960,307250,331490,181670,176740,231290,44090,11440,2392790 +2023-24,216500,678350,305320,334470,184080,180500,235060,44680,0,2178950 diff --git a/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_employment.csv b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_employment.csv new file mode 100644 index 00000000..1fd0a7fe --- /dev/null +++ b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_employment.csv @@ -0,0 +1,90 @@ +SIC Code,Description,0-4,5-9,10-19,20-49,50-99,100-249,250+,Total +01,Crop and animal production; hunting and related service activities,119405,11525,3095,1195,350,155,90,135815 +02,Forestry and logging,4180,530,250,105,25,5,0,5095 +03,Fishing and aquaculture,3640,345,125,25,10,0,0,4145 +05,Mining of coal and lignite,5,0,0,0,0,0,0,5 +06,Extraction of crude petroleum and natural gas,70,15,10,15,10,15,15,150 +07,Mining of metal ores,5,0,0,0,0,0,0,5 +08,Other mining and quarrying,725,300,220,150,45,20,5,1465 +09,Mining support service activities,260,40,25,35,20,15,15,410 +10,Manufacture of food products,5315,1870,1365,1050,495,440,405,10940 +11,Manufacture of beverages,1910,445,315,200,80,60,45,3055 +12,Manufacture of tobacco products,0,5,0,0,0,0,0,5 +13,Manufacture of textiles,2875,765,455,310,125,60,10,4600 +14,Manufacture of wearing apparel,2830,515,255,160,50,10,5,3825 +15,Manufacture of leather and related products,415,160,65,40,15,5,5,705 
+16,Manufacture of wood and of products of wood and cork; except furniture; manufacture of articles of straw and plaiting materials,7955,1395,795,510,175,60,15,10905 +17,Manufacture of paper and paper products,705,190,195,235,135,130,30,1620 +18,Printing and reproduction of recorded media,7245,1535,725,470,150,90,20,10235 +19,Manufacture of coke and refined petroleum products,40,15,10,15,15,10,10,115 +20,Manufacture of chemicals and chemical products,2000,505,350,435,230,160,65,3745 +21,Manufacture of basic pharmaceutical products and pharmaceutical preparations,400,75,40,75,50,40,55,735 +22,Manufacture of rubber and plastic products,2460,1095,835,805,415,265,65,5940 +23,Manufacture of other non-metallic mineral products,2800,925,615,465,215,130,30,5180 +24,Manufacture of basic metals,1070,275,235,250,105,65,35,2035 +25,Manufacture of fabricated metal products; except machinery and equipment,13980,3870,2855,2145,715,280,70,23915 +26,Manufacture of computer; electronic and optical products,3335,850,560,610,265,165,55,5840 +27,Manufacture of electrical equipment,1640,480,420,430,195,120,40,3325 +28,Manufacture of machinery and equipment n.e.c.,4155,1205,865,845,410,225,105,7810 +29,Manufacture of motor vehicles; trailers and semi-trailers,2455,475,295,275,170,125,95,3890 +30,Manufacture of other transport equipment,1430,245,145,135,105,85,95,2240 +31,Manufacture of furniture,4450,1075,650,450,175,95,30,6925 +32,Other manufacturing,7045,1590,655,420,150,90,35,9985 +33,Repair and installation of machinery and equipment,11995,1695,985,580,180,130,50,15615 +35,Electricity; gas; steam and air conditioning supply,5200,730,390,275,145,155,95,6990 +36,Water collection; treatment and supply,375,170,145,165,65,45,40,1005 +37,Sewerage,975,260,175,180,45,35,5,1675 +38,Waste collection; treatment and disposal activities; materials recovery,4630,1480,1065,950,395,220,55,8795 +39,Remediation activities and other waste management services,845,210,135,85,15,5,0,1295 +41,Construction 
of buildings,110965,9690,4175,1665,625,395,80,127595 +42,Civil engineering,18605,2615,1330,955,415,205,95,24220 +43,Specialised construction activities,203530,21540,8795,4060,1010,375,70,239380 +45,Wholesale and retail trade and repair of motor vehicles and motorcycles,64535,16060,6460,4050,1130,330,50,92615 +46,Wholesale trade; except of motor vehicles and motorcycles,75485,21110,13315,7750,2300,1005,390,121355 +47,Retail trade; except of motor vehicles and motorcycles,185010,58960,35145,16905,3285,2255,1040,302600 +49,Land transport and transport via pipelines,52885,6295,3610,2665,1080,665,320,67520 +50,Water transport,1060,190,115,85,35,10,10,1505 +51,Air transport,950,115,55,70,35,30,50,1305 +52,Warehousing and support activities for transportation,16105,3795,2465,2145,950,680,440,26580 +53,Postal and courier activities,32395,1385,595,925,700,420,125,36545 +55,Accommodation,12930,3600,3375,3925,1435,630,170,26065 +56,Food and beverage service activities,91715,47075,32560,20325,2915,1345,135,196070 +58,Publishing activities,9985,1205,755,525,180,90,70,12810 +59,Motion picture; video and television programme production; sound recording and music publishing activities,30560,1500,900,745,270,95,30,34100 +60,Programming and broadcasting activities,1885,165,80,100,35,30,25,2320 +61,Telecommunications,7520,1440,1005,795,350,195,135,11440 +62,Computer programming; consultancy and related activities,107640,7760,5010,3550,1350,680,350,126340 +63,Information service activities,6740,830,575,325,115,70,55,8710 +64,Financial service activities; except insurance and pension funding,19050,3625,2290,1140,290,190,300,26885 +65,Insurance; reinsurance and pension funding; except compulsory social security,6970,170,130,100,50,60,105,7585 +66,Activities auxiliary to financial services and insurance activities,26920,4315,2270,1395,690,445,280,36315 +68,Real estate activities,107155,14675,6180,1825,475,335,195,130840 +69,Legal and accounting 
activities,59055,9660,5675,2980,960,585,355,79270 +70,Activities of head offices; management consultancy activities,142325,9000,5030,2955,1180,695,370,161555 +71,Architectural and engineering activities; technical testing and analysis,68340,7175,4330,2720,935,445,180,84125 +72,Scientific research and development,4070,790,660,535,250,165,135,6605 +73,Advertising and market research,19425,1960,1240,900,325,160,110,24120 +74,Other professional; scientific and technical activities,67880,5225,2245,955,220,75,25,76625 +75,Veterinary activities,2770,1460,1610,885,135,40,10,6910 +77,Rental and leasing activities,14710,3585,2190,1115,300,125,35,22060 +78,Employment activities,25925,4335,3385,2715,1715,1265,675,40015 +79,Travel agency; tour operator and other reservation service and related activities,7555,1875,695,350,140,80,30,10725 +80,Security and investigation activities,11060,1925,1175,980,365,205,130,15840 +81,Services to buildings and landscape activities,54515,9800,5185,2885,980,485,340,74190 +82,Office administrative; office support and other business support activities,89580,9640,4400,1950,760,420,295,107045 +84,Public administration and defence; compulsory social security,10920,3080,2885,3200,1615,1370,1325,24395 +85,Education,31990,8755,8850,14750,8445,4670,935,78395 +86,Human health activities,49340,11185,10020,7510,2355,1430,1050,82890 +87,Residential care activities,9530,5460,6725,7160,3885,870,85,33715 +88,Social work activities without accommodation,20760,12015,11720,8555,2270,845,255,56420 +90,Creative; arts and entertainment activities,29555,1665,820,405,170,100,20,32735 +91,Libraries; archives; museums and other cultural activities,2775,1205,835,625,255,125,40,5860 +92,Gambling and betting activities,4085,4160,500,320,95,40,10,9210 +93,Sports activities and amusement and recreation activities,27855,6455,4600,3650,1330,515,130,44535 +94,Activities of membership organisations,17845,4915,2240,1285,415,225,85,27010 +95,Repair of computers and personal and 
household goods,8545,875,345,210,55,40,20,10090 +96,Other personal service activities,69185,13545,4365,1170,195,95,25,88580 +97,Activities of households as employers of domestic personnel,0,0,0,0,0,0,0,0 +98,Undifferentiated goods- and services-producing activities of private households for own use,0,0,0,0,0,0,0,0 +99,Activities of extraterritorial organisations and bodies,0,0,0,0,5,0,0,5 +,Total,2269010,404720,242235,160880,54820,29115,12880,3173660 diff --git a/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_turnover.csv b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_turnover.csv new file mode 100644 index 00000000..f8bf05f7 --- /dev/null +++ b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/firm_turnover.csv @@ -0,0 +1,90 @@ +SIC Code,Description,0-49,50-99,100-249,250-499,500-999,1000-4999,5000+,Total +01,Crop and animal production; hunting and related service activities,42455,24050,29210,16385,11110,8190,1015,132415 +02,Forestry and logging,850,1035,1640,680,345,255,45,4850 +03,Fishing and aquaculture,1245,790,990,500,195,215,50,3985 +05,Mining of coal and lignite,0,0,0,0,0,0,5,5 +06,Extraction of crude petroleum and natural gas,10,10,10,15,10,15,40,110 +07,Mining of metal ores,0,0,0,0,0,0,0,0 +08,Other mining and quarrying,80,75,150,70,70,150,115,710 +09,Mining support service activities,45,50,120,25,20,40,70,370 +10,Manufacture of food products,1465,1360,2140,1205,870,1200,1170,9410 +11,Manufacture of beverages,645,465,665,330,260,275,135,2775 +12,Manufacture of tobacco products,0,0,0,0,0,0,0,0 +13,Manufacture of textiles,480,840,1260,625,415,505,215,4340 +14,Manufacture of wearing apparel,580,920,1145,460,290,255,85,3735 +15,Manufacture of leather and related products,125,135,140,55,45,65,30,595 +16,Manufacture of wood and of products of wood and cork; except furniture; manufacture of articles of straw and plaiting materials,720,2440,3630,1345,975,1025,380,10515 +17,Manufacture of paper and paper 
products,155,175,250,120,120,270,310,1400 +18,Printing and reproduction of recorded media,1740,1960,2850,1205,850,940,295,9840 +19,Manufacture of coke and refined petroleum products,5,5,10,5,5,10,45,85 +20,Manufacture of chemicals and chemical products,450,465,615,340,250,515,635,3270 +21,Manufacture of basic pharmaceutical products and pharmaceutical preparations,90,65,160,70,45,75,145,650 +22,Manufacture of rubber and plastic products,370,460,885,640,675,1310,890,5230 +23,Manufacture of other non-metallic mineral products,270,565,955,585,440,775,415,4005 +24,Manufacture of basic metals,170,230,415,220,205,325,280,1845 +25,Manufacture of fabricated metal products; except machinery and equipment,2000,4325,5060,3110,2615,4000,1475,22585 +26,Manufacture of computer; electronic and optical products,885,795,1120,570,465,905,655,5395 +27,Manufacture of electrical equipment,355,325,605,330,310,640,460,3025 +28,Manufacture of machinery and equipment n.e.c.,775,810,1515,870,760,1455,950,7135 +29,Manufacture of motor vehicles; trailers and semi-trailers,320,475,1230,405,310,455,380,3575 +30,Manufacture of other transport equipment,240,495,480,185,150,195,195,1940 +31,Manufacture of furniture,490,1155,2060,960,760,915,335,6675 +32,Other manufacturing,1470,2045,2625,1245,920,935,355,9595 +33,Repair and installation of machinery and equipment,1540,3755,4510,1845,1240,1360,440,14690 +35,Electricity; gas; steam and air conditioning supply,1210,640,1310,835,680,910,415,6000 +36,Water collection; treatment and supply,10,5,15,10,5,20,35,100 +37,Sewerage,95,200,350,215,150,165,50,1225 +38,Waste collection; treatment and disposal activities; materials recovery,480,605,1565,935,770,1170,685,6210 +39,Remediation activities and other waste management services,110,205,300,160,155,215,65,1210 +41,Construction of buildings,14640,14325,37080,25040,15385,12255,3100,121825 +42,Civil engineering,2165,3705,7230,3310,2160,2620,1215,22405 +43,Specialised construction 
activities,15930,56355,88660,34550,19270,17435,3770,235970 +45,Wholesale and retail trade and repair of motor vehicles and motorcycles,5695,12925,26270,15695,9030,8895,3115,81625 +46,Wholesale trade; except of motor vehicles and motorcycles,12565,10035,19705,14470,12770,20390,12030,101965 +47,Retail trade; except of motor vehicles and motorcycles,21735,30035,66940,42655,27695,21215,4345,214620 +49,Land transport and transport via pipelines,9325,15180,21980,6225,4100,4815,1550,63175 +50,Water transport,300,260,290,145,100,140,135,1370 +51,Air transport,155,130,105,300,155,85,120,1050 +52,Warehousing and support activities for transportation,1930,4245,3410,1845,2040,2550,1575,17595 +53,Postal and courier activities,20100,6175,4495,1315,710,690,200,33685 +55,Accommodation,2155,3080,6390,2950,2360,3095,1025,21055 +56,Food and beverage service activities,10165,27030,63160,29005,14375,7505,1215,152455 +58,Publishing activities,2580,2355,3765,1470,885,950,405,12410 +59,Motion picture; video and television programme production; sound recording and music publishing activities,5270,8610,11355,3890,1660,1805,720,33310 +60,Programming and broadcasting activities,315,430,835,245,105,105,65,2100 +61,Telecommunications,1085,1080,2750,1055,625,850,515,7960 +62,Computer programming; consultancy and related activities,20330,34310,44905,8900,5510,6865,2970,123790 +63,Information service activities,1455,2425,2210,685,570,585,265,8195 +64,Financial service activities; except insurance and pension funding,10135,975,3115,2365,1195,1000,900,19685 +65,Insurance; reinsurance and pension funding; except compulsory social security,6405,25,130,185,155,200,185,7285 +66,Activities auxiliary to financial services and insurance activities,8250,1190,10815,4240,2985,2730,1605,31815 +68,Real estate activities,24310,28790,33555,15220,7940,6280,1795,117890 +69,Legal and accounting activities,9640,16505,25585,9615,5330,4850,1480,73005 +70,Activities of head offices; management consultancy 
activities,25720,28145,66505,19080,7745,7230,2440,156865 +71,Architectural and engineering activities; technical testing and analysis,9245,22655,28290,7905,4615,5040,1775,79525 +72,Scientific research and development,1425,955,1425,595,425,625,400,5850 +73,Advertising and market research,3320,3665,8715,3225,1700,2020,905,23550 +74,Other professional; scientific and technical activities,9965,19125,32590,6620,3505,2890,675,75370 +75,Veterinary activities,275,740,750,415,505,665,100,3450 +77,Rental and leasing activities,2955,2625,4720,2525,1970,2530,965,18290 +78,Employment activities,2890,4120,9020,4840,3640,4915,2200,31625 +79,Travel agency; tour operator and other reservation service and related activities,1570,875,1925,1745,1085,1100,510,8810 +80,Security and investigation activities,920,1825,4195,1360,945,1115,315,10675 +81,Services to buildings and landscape activities,7060,10990,21090,7325,3295,2535,665,52960 +82,Office administrative; office support and other business support activities,9340,14280,33625,23885,13245,8060,1885,104320 +84,Public administration and defence; compulsory social security,325,4870,1515,400,160,185,390,7845 +85,Education,4255,10395,14345,6245,4840,5165,2750,47995 +86,Human health activities,3915,23290,16355,8070,5520,7115,1170,65435 +87,Residential care activities,1020,1265,1355,1350,1565,4180,1125,11860 +88,Social work activities without accommodation,1670,6145,9865,7150,5885,5000,780,36495 +90,Creative; arts and entertainment activities,5625,6715,12420,3545,1925,1760,375,32365 +91,Libraries; archives; museums and other cultural activities,250,315,465,280,230,270,110,1920 +92,Gambling and betting activities,55,100,210,180,115,130,90,880 +93,Sports activities and amusement and recreation activities,9080,7285,10975,5115,3175,2590,600,38820 +94,Activities of membership organisations,4830,4420,6820,3385,1600,1235,330,22620 +95,Repair of computers and personal and household goods,930,1885,4840,1010,440,380,120,9605 +96,Other personal 
service activities,13460,28750,27930,6390,2935,2050,390,81905 +97,Activities of households as employers of domestic personnel,0,0,0,0,0,0,0,0 +98,Undifferentiated goods- and services-producing activities of private households for own use,0,0,0,0,0,0,0,0 +99,Activities of extraterritorial organisations and bodies,0,0,0,0,0,0,0,0 +,Total,388665,537540,874665,384570,234660,226445,78230,2724775 diff --git a/policyengine_uk_data/datasets/firm/ONS_UK_business_data/raw_data/ukbusinessworkbook2024.xlsx b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/raw_data/ukbusinessworkbook2024.xlsx new file mode 100644 index 00000000..9c1436e6 Binary files /dev/null and b/policyengine_uk_data/datasets/firm/ONS_UK_business_data/raw_data/ukbusinessworkbook2024.xlsx differ diff --git a/policyengine_uk_data/datasets/firm/generate_synthetic_data.py b/policyengine_uk_data/datasets/firm/generate_synthetic_data.py new file mode 100644 index 00000000..2ed49d9c --- /dev/null +++ b/policyengine_uk_data/datasets/firm/generate_synthetic_data.py @@ -0,0 +1,1827 @@ +#!/usr/bin/env python3 +""" +Synthetic firm data generation for UK business population. + +This script generates individual firm records from ONS business structure data, +calibrated to HMRC VAT registration statistics for accurate representation. 
def load_data(
    self,
) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    int,
]:
    """Read the six ONS / HMRC CSV tables stored next to this module.

    Returns:
        A 7-tuple, in this order: ONS firm counts by turnover band, ONS
        firm counts by employment band, HMRC VAT population by turnover
        band, HMRC VAT population by sector, HMRC VAT liability by sector,
        HMRC VAT liability by turnover band, and the ONS total firm count.
    """
    logger.info("Loading data files...")

    # Both input folders are siblings of this source file.
    here = Path(__file__).parent
    ons_dir = here / "ONS_UK_business_data"
    hmrc_dir = here / "HMRC_VAT_annual_statistics"

    # ONS business-structure tables.
    ons_df = pd.read_csv(ons_dir / "firm_turnover.csv")
    ons_employment_df = pd.read_csv(ons_dir / "firm_employment.csv")

    # HMRC VAT population and liability tables.
    hmrc_turnover_df = pd.read_csv(
        hmrc_dir / "vat_population_by_turnover_band.csv"
    )
    hmrc_sector_df = pd.read_csv(hmrc_dir / "vat_population_by_sector.csv")
    vat_liability_sector_df = pd.read_csv(
        hmrc_dir / "vat_liability_by_sector.csv"
    )
    vat_liability_band_df = pd.read_csv(
        hmrc_dir / "vat_liability_by_turnover_band.csv"
    )

    logger.info(f"Loaded ONS turnover data: {len(ons_df)} rows")
    logger.info(f"Loaded ONS employment data: {len(ons_employment_df)} rows")
    logger.info(f"Loaded HMRC turnover data: {len(hmrc_turnover_df)} rows")
    logger.info(f"Loaded HMRC sector data: {len(hmrc_sector_df)} rows")
    logger.info(
        f"Loaded VAT liability by sector data: {len(vat_liability_sector_df)} rows"
    )
    logger.info(
        f"Loaded VAT liability by band data: {len(vat_liability_band_df)} rows"
    )

    # The summary row of the ONS turnover table is the one whose SIC code is
    # blank; its "Total" cell is the firm count. If no such row exists, add
    # up the "Total" column over every row not described as a total.
    sic_column = ons_df["SIC Code"]
    summary_rows = ons_df[sic_column.isna() | (sic_column == "")]
    if len(summary_rows) == 0:
        is_total = ons_df["Description"].str.contains("Total", na=False)
        ons_total = ons_df.loc[~is_total, "Total"].sum()
    else:
        ons_total = summary_rows.iloc[0]["Total"]

    logger.info(f"ONS total firms: {ons_total:,}")

    return (
        ons_df,
        ons_employment_df,
        hmrc_turnover_df,
        hmrc_sector_df,
        vat_liability_sector_df,
        vat_liability_band_df,
        ons_total,
    )
def _generate_turnover_values(
    self,
    band_name: str,
    firm_count: int,
    min_turnover: float,
    max_turnover: float,
    midpoint_turnover: float,
) -> Tensor:
    """Draw jittered turnover values (in £k) for the firms of one ONS band.

    Each value is a uniform draw over ``[min_turnover, max_turnover]`` plus
    zero-mean Gaussian jitter, floored at 0.1. Because of the jitter a value
    may fall outside the band it was drawn for.

    Args:
        band_name: ONS turnover band label (e.g. '0-49'); not read here.
        firm_count: Number of values to draw.
        min_turnover: Lower edge of the band (£k).
        max_turnover: Upper edge of the band (£k).
        midpoint_turnover: Band midpoint (£k); not read here.

    Returns:
        1-D tensor of ``firm_count`` turnover values on ``self.device``.
    """
    device = self.device

    if firm_count == 0:
        return torch.empty(0, device=device)

    span = max_turnover - min_turnover

    # Jitter standard deviation: 20% of the band width, never below 25 (£k).
    sigma = max(25.0, span * 0.2)

    # Uniform position inside the band first, then the Gaussian jitter.
    position = torch.rand(firm_count, device=device)
    jitter = torch.normal(0, sigma, (firm_count,), device=device)

    # Floor at 0.1 so every generated firm keeps a strictly positive turnover.
    return (min_turnover + position * span + jitter).clamp(min=0.1)
Voluntary: Random subset of firms below threshold (calculated from HMRC data) + + Args: + turnover_values: Array of turnover values + hmrc_bands: HMRC band data to calculate voluntary rate + + Returns: + Boolean array indicating VAT registration status + """ + logger.info("Assigning VAT registration flags...") + + # Calculate voluntary VAT rate from Target/Synthetic ratio for £1_to_Threshold + hmrc_target_1_to_threshold = hmrc_bands[ + "£1_to_Threshold" + ] # HMRC target: 678,350 + synthetic_1_to_threshold = ( + ((turnover_values > 0) & (turnover_values <= 85.0)).sum().item() + ) # Current synthetic count + voluntary_rate = ( + hmrc_target_1_to_threshold / synthetic_1_to_threshold + if synthetic_1_to_threshold > 0 + else 0.15 + ) + + logger.info( + f"Calculated voluntary VAT rate: {voluntary_rate:.3f} (Target: {hmrc_target_1_to_threshold:,} / Synthetic: {synthetic_1_to_threshold:,})" + ) + + # Mandatory registration above threshold + mandatory_vat = turnover_values > 85.0 + + # Voluntary registration below threshold (but above 0) + below_threshold = (turnover_values > 0) & (turnover_values <= 85.0) + n_below_threshold = below_threshold.sum().item() + + if n_below_threshold > 0: + # Random selection for voluntary VAT registration using calculated rate + voluntary_mask = ( + torch.rand(len(turnover_values), device=self.device) + < voluntary_rate + ) + voluntary_vat = below_threshold & voluntary_mask + else: + voluntary_vat = torch.zeros_like(below_threshold) + + vat_registered = mandatory_vat | voluntary_vat + + logger.info( + f"VAT registration: {mandatory_vat.sum():.0f} mandatory + {voluntary_vat.sum():.0f} voluntary = {vat_registered.sum():.0f} total" + ) + logger.info(f"Non-VAT registered: {(~vat_registered).sum():.0f} firms") + + return vat_registered + + def create_comprehensive_target_matrix( + self, + turnover_values: Tensor, + sic_codes: Tensor, + input_values: Tensor, + hmrc_bands: Dict[str, int], + hmrc_sector_df: pd.DataFrame, + ons_employment_df: 
pd.DataFrame, + vat_liability_sector_df: pd.DataFrame, + vat_liability_band_df: pd.DataFrame, + ons_total: int, + ) -> Tuple[Tensor, Tensor]: + """Create comprehensive target matrix for calibration. + + Creates targets for all HMRC turnover bands, sector targets, and VAT liability targets. + The optimization will determine which firms contribute to which targets + based on VAT registration flags. + + Args: + turnover_values: Array of turnover values + sic_codes: Array of SIC codes + input_values: Array of input values + hmrc_bands: Dictionary of all HMRC targets by band + hmrc_sector_df: HMRC sector data for ratio targets + ons_employment_df: ONS employment data for ratio targets + vat_liability_sector_df: VAT liability data by sector + vat_liability_band_df: VAT liability data by turnover band + ons_total: Total firm count target from ONS + + Returns: + Tuple of (target_matrix, target_values) + """ + logger.info("Creating comprehensive target matrix for calibration...") + + n_firms = len(turnover_values) + # Get sector data and calculate ratios + sector_rows = hmrc_sector_df[ + hmrc_sector_df["Trade_Sector"] != "Total" + ].copy() + hmrc_total = hmrc_sector_df[hmrc_sector_df["Trade_Sector"] == "Total"][ + "2023-24" + ].iloc[0] + n_sectors = len(sector_rows) + + # Get VAT liability sector data (excluding total) + vat_liability_sector_rows = vat_liability_sector_df[ + vat_liability_sector_df["Trade_Sector"] != "Total" + ].copy() + n_vat_sectors = len(vat_liability_sector_rows) + + # Get VAT liability by turnover band data + vat_liability_band_latest = vat_liability_band_df.iloc[ + -1 + ] # Get 2023-24 data + vat_liability_bands = { + "Negative_or_Zero": vat_liability_band_latest["Negative_or_Zero"], + "£1_to_Threshold": vat_liability_band_latest["£1_to_Threshold"], + "£Threshold_to_£150k": vat_liability_band_latest[ + "£Threshold_to_£150k" + ], + "£150k_to_£300k": vat_liability_band_latest["£150k_to_£300k"], + "£300k_to_£500k": 
vat_liability_band_latest["£300k_to_£500k"], + "£500k_to_£1m": vat_liability_band_latest["£500k_to_£1m"], + "£1m_to_£10m": vat_liability_band_latest["£1m_to_£10m"], + "Greater_than_£10m": vat_liability_band_latest[ + "Greater_than_£10m" + ], + } + n_vat_bands = 7 # Number of VAT liability bands (excluding Total and Negative_or_Zero) + + # Get employment data and calculate ratios + emp_bands = [ + "0-4", + "5-9", + "10-19", + "20-49", + "50-99", + "100-249", + "250+", + ] + n_employment_bands = len(emp_bands) + + n_targets = ( + 7 + n_sectors + n_employment_bands + n_vat_sectors + n_vat_bands + ) # 7 turnover + sector + employment + VAT liability by sector + VAT liability by band + + # Initialize target matrix + target_matrix = torch.zeros(n_targets, n_firms, device=self.device) + + # Map all firms to HMRC bands + all_band_indices = self._map_to_hmrc_bands(turnover_values) + + # Rows 0-6: Turnover band targets + # Row 0: £1_to_Threshold - keep ONS structure + firms_in_threshold = all_band_indices == 1 + target_matrix[0, firms_in_threshold] = 1.0 + + # Row 1: £Threshold_to_£150k - calibrate to HMRC target + firms_150k = all_band_indices == 2 + target_matrix[1, firms_150k] = 1.0 + + # Rows 2-6: Individual HMRC bands above £150k + for i, band_idx in enumerate( + [3, 4, 5, 6, 7], start=2 + ): # £150k_to_£300k, etc. + firms_in_band = all_band_indices == band_idx + target_matrix[i, firms_in_band] = 1.0 + + # Rows 7 to 7+n_sectors-1: Sector targets (for VAT-registered firms only) + for i, (_, sector_row) in enumerate(sector_rows.iterrows(), start=7): + trade_sector = sector_row["Trade_Sector"] + # Convert HMRC Trade_Sector to SIC code (00001 -> 1, 00002 -> 2, etc.) 
+ sic_code = int(trade_sector) + + # Find firms in this sector + firms_in_sector = sic_codes == sic_code + target_matrix[i, firms_in_sector] = 1.0 + + # Generate employment assignments for target matrix (temporary assignments for optimization) + employment_values = self.assign_employment(n_firms, ons_employment_df) + + # Map employment to bands + def map_employment_to_band_idx(emp_val): + if emp_val <= 4: + return 0 # '0-4' + elif emp_val <= 9: + return 1 # '5-9' + elif emp_val <= 19: + return 2 # '10-19' + elif emp_val <= 49: + return 3 # '20-49' + elif emp_val <= 99: + return 4 # '50-99' + elif emp_val <= 249: + return 5 # '100-249' + else: + return 6 # '250+' + + employment_band_indices = torch.tensor( + [ + map_employment_to_band_idx(emp.item()) + for emp in employment_values + ], + dtype=torch.long, + device=self.device, + ) + + # Rows 7+n_sectors to 7+n_sectors+n_employment_bands-1: Employment ratio targets + emp_start_row = 7 + n_sectors + for band_idx, _ in enumerate(emp_bands): + row_idx = emp_start_row + band_idx + firms_in_emp_band = employment_band_indices == band_idx + target_matrix[row_idx, firms_in_emp_band] = 1.0 + + # Calculate VAT liability for each firm (output - input) + vat_liability_values = turnover_values - input_values # in £k + + # Rows 7+n_sectors+n_employment_bands to 7+n_sectors+n_employment_bands+n_vat_sectors-1: VAT liability targets by sector + vat_start_row = 7 + n_sectors + n_employment_bands + for i, (_, vat_row) in enumerate(vat_liability_sector_rows.iterrows()): + row_idx = vat_start_row + i + sic_code = int(vat_row["Trade_Sector"]) + + # Find firms in this sector and calculate their VAT liability contribution + firms_in_sector = sic_codes == sic_code + # Weight by VAT liability for this sector's target + target_matrix[row_idx, firms_in_sector] = vat_liability_values[ + firms_in_sector + ] + + # Rows 7+n_sectors+n_employment_bands+n_vat_sectors to end: VAT liability targets by turnover band (excluding Negative_or_Zero) + 
vat_band_start_row = 7 + n_sectors + n_employment_bands + n_vat_sectors + for i, band_name in enumerate( + [ + "£1_to_Threshold", + "£Threshold_to_£150k", + "£150k_to_£300k", + "£300k_to_£500k", + "£500k_to_£1m", + "£1m_to_£10m", + "Greater_than_£10m", + ] + ): + row_idx = vat_band_start_row + i + + # Map firms to this turnover band + if band_name == "£1_to_Threshold": + firms_in_band = (turnover_values > 0) & (turnover_values <= 85) + elif band_name == "£Threshold_to_£150k": + firms_in_band = (turnover_values > 85) & ( + turnover_values <= 150 + ) + elif band_name == "£150k_to_£300k": + firms_in_band = (turnover_values > 150) & ( + turnover_values <= 300 + ) + elif band_name == "£300k_to_£500k": + firms_in_band = (turnover_values > 300) & ( + turnover_values <= 500 + ) + elif band_name == "£500k_to_£1m": + firms_in_band = (turnover_values > 500) & ( + turnover_values <= 1000 + ) + elif band_name == "£1m_to_£10m": + firms_in_band = (turnover_values > 1000) & ( + turnover_values <= 10000 + ) + else: # Greater_than_£10m + firms_in_band = turnover_values > 10000 + + # Weight by VAT liability for this band's target + target_matrix[row_idx, firms_in_band] = vat_liability_values[ + firms_in_band + ] + + # Calculate targets + # £1_to_Threshold: Use ONS structure (current count from generation) + ons_threshold_count = firms_in_threshold.sum().item() + + # Turnover band targets (absolute numbers) + turnover_targets = [ + ons_threshold_count, # £1_to_Threshold: Keep ONS structure + hmrc_bands["£Threshold_to_£150k"], # £Threshold_to_£150k: HMRC + hmrc_bands["£150k_to_£300k"], # £150k_to_£300k: HMRC + hmrc_bands["£300k_to_£500k"], # £300k_to_£500k: HMRC + hmrc_bands["£500k_to_£1m"], # £500k_to_£1m: HMRC + hmrc_bands["£1m_to_£10m"], # £1m_to_£10m: HMRC + hmrc_bands["Greater_than_£10m"], # Greater_than_£10m: HMRC + ] + + # Sector targets (direct HMRC targets for VAT-registered firms) + sector_targets = [] + for _, sector_row in sector_rows.iterrows(): + sector_count = 
sector_row["2023-24"] # Direct HMRC target + sector_targets.append(sector_count) + + # Employment count targets (direct ONS employment counts) + employment_targets = [] + # Get ONS employment totals + ons_emp_totals = {} + for band in emp_bands: + if band in ons_employment_df.columns: + sector_rows_emp = ons_employment_df[ + ~ons_employment_df["Description"].str.contains( + "Total", na=False + ) + ] + ons_emp_totals[band] = sector_rows_emp[band].fillna(0).sum() + else: + ons_emp_totals[band] = 0 + + # Use direct ONS employment counts as targets + for band in emp_bands: + emp_count = ons_emp_totals[band] + employment_targets.append(emp_count) + + # VAT liability targets by sector (in millions £, convert to £k) + vat_liability_sector_targets = [] + for _, vat_row in vat_liability_sector_rows.iterrows(): + vat_liability_millions = vat_row["2023-24"] # in millions £ + vat_liability_k = vat_liability_millions * 1000 # convert to £k + vat_liability_sector_targets.append(vat_liability_k) + + # VAT liability targets by turnover band (in millions £, convert to £k) - excluding Negative_or_Zero + vat_liability_band_targets = [] + for band_name in [ + "£1_to_Threshold", + "£Threshold_to_£150k", + "£150k_to_£300k", + "£300k_to_£500k", + "£500k_to_£1m", + "£1m_to_£10m", + "Greater_than_£10m", + ]: + vat_liability_millions = vat_liability_bands[ + band_name + ] # in millions £ + vat_liability_k = vat_liability_millions * 1000 # convert to £k + vat_liability_band_targets.append(vat_liability_k) + + target_values_list = ( + turnover_targets + + sector_targets + + employment_targets + + vat_liability_sector_targets + + vat_liability_band_targets + ) + + target_values = torch.tensor( + target_values_list, dtype=torch.float32, device=self.device + ) + + logger.info(f"Target matrix shape: {target_matrix.shape}") + logger.info(f"Calibration targets (turnover + sector + employment):") + + # Log turnover targets + turnover_names = [ + "£1_to_Threshold (ONS)", + "£Threshold_to_£150k 
def optimize_weights(
    self,
    target_matrix: Tensor,
    target_values: Tensor,
    n_iterations: int = 300,
    lr: float = 0.01,
) -> Tensor:
    """Fit positive per-firm weights so ``target_matrix @ weights`` matches the targets.

    Weights are parameterised as ``exp(log_weights)`` (always positive) and
    fitted with Adam on a symmetric relative error loss, so targets on very
    different scales contribute comparably.

    Args:
        target_matrix: Matrix where A[i, j] = contribution of firm j to target i.
        target_values: Vector of target values to match, one per matrix row.
        n_iterations: Maximum number of optimization iterations.
        lr: Adam learning rate.

    Returns:
        Detached 1-D tensor of optimized weights, one per firm (the weights
        after the last iteration that ran).
    """
    logger.info("Starting multi-objective weight optimization...")

    n_targets, n_firms = target_matrix.shape

    # Initialize log-weights (ensures positive weights)
    log_weights = torch.zeros(n_firms, device=self.device, requires_grad=True)
    optimizer = torch.optim.Adam([log_weights], lr=lr)

    # Row layout built by create_comprehensive_target_matrix:
    #   7 turnover bands | sectors | employment bands |
    #   VAT liability by sector | 7 VAT liability by turnover band
    # Only the first and the last group are re-weighted, and both have a
    # fixed size, so they are addressed from the two ends of the vector
    # instead of guessing the sizes of the groups in between.
    n_turnover_targets = 7
    n_vat_band_targets = 7
    importance_weights = torch.ones(
        n_targets, dtype=target_matrix.dtype, device=target_matrix.device
    )
    importance_weights[:n_turnover_targets] = 5.0  # 5x for turnover targets
    # max() keeps the turnover rows at 5x if the matrix has fewer than 14 rows.
    vat_band_start_idx = max(
        n_turnover_targets, n_targets - n_vat_band_targets
    )
    importance_weights[vat_band_start_idx:] = 2.0  # 2x for VAT liability bands

    # Loop-invariant pieces of the loss.
    epsilon = 1e-6
    target_adj = target_values + epsilon

    best_loss = float("inf")
    patience = 100
    patience_counter = 0

    for iteration in range(n_iterations):
        optimizer.zero_grad()

        # Convert to positive weights
        weights = torch.exp(log_weights)

        # Dropout: each firm is zeroed with probability 0.05 (no rescaling).
        # NOTE(review): the previous comment said 15% while the code dropped
        # 5%; the coded rate is kept - confirm which one was intended.
        keep_mask = torch.rand_like(weights) > 0.05
        weights = weights * keep_mask

        # Calculate predictions: target_matrix @ weights
        predictions = torch.matmul(target_matrix, weights)
        pred_adj = predictions + epsilon

        # Symmetric relative error:
        #   min(|pred/target - 1|^2, |target/pred - 1|^2)
        error_1 = ((pred_adj / target_adj) - 1) ** 2
        error_2 = ((target_adj / pred_adj) - 1) ** 2
        sre_loss = torch.minimum(error_1, error_2)

        # Small L1 penalty on the log-weights to prevent extreme weights.
        reg_loss = 0.01 * torch.mean(torch.abs(log_weights))
        total_loss = torch.mean(sre_loss * importance_weights) + reg_loss

        total_loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_([log_weights], max_norm=1.0)

        optimizer.step()

        # Early stopping on the (dropout-noisy) training loss.
        loss_value = total_loss.item()
        if loss_value < best_loss:
            best_loss = loss_value
            patience_counter = 0
        else:
            patience_counter += 1

        if iteration % 100 == 0:
            logger.info(f"Iteration {iteration}: Loss = {loss_value:.6f}")

        if patience_counter > patience:
            logger.info(f"Early stopping at iteration {iteration}")
            break

    # Evaluate without dropout and without building an autograd graph.
    with torch.no_grad():
        final_weights = torch.exp(log_weights)
        final_predictions = torch.matmul(target_matrix, final_weights)

    logger.info("Optimization complete:")
    # Labels for the leading turnover-band rows, in row order. The matrix has
    # no "Negative_or_Zero" row, so that label must not appear here or every
    # following label would be shifted by one.
    turnover_target_names = [
        "£1_to_Threshold",
        "£Threshold_to_£150k",
        "£150k_to_£300k",
        "£300k_to_£500k",
        "£500k_to_£1m",
        "£1m_to_£10m",
        "Greater_than_£10m",
    ]
    for name, pred, target in zip(
        turnover_target_names,
        final_predictions.tolist(),
        target_values.tolist(),
    ):
        if target > 0:  # Only log targets we're actually trying to match
            accuracy = 1 - abs(pred - target) / target
            logger.info(
                f"  {name}: {pred:.0f} vs {target:.0f} ({accuracy:.1%})"
            )
        else:
            logger.info(f"  {name}: SKIPPED (pred: {pred:.0f})")

    return final_weights.detach()
def _map_to_hmrc_bands(self, turnover_values: Tensor) -> Tensor:
    """Classify each turnover value (£k) into one of the eight HMRC bands.

    Band indices:
        0 Negative_or_Zero, 1 £1_to_Threshold, 2 £Threshold_to_£150k,
        3 £150k_to_£300k, 4 £300k_to_£500k, 5 £500k_to_£1m,
        6 £1m_to_£10m, 7 Greater_than_£10m.

    Args:
        turnover_values: Tensor of turnover values.

    Returns:
        ``torch.long`` tensor of band indices with the same shape as the input.
    """
    # Inclusive upper edge of bands 1..6; band k covers (previous edge, edge].
    upper_edges = (85, 150, 300, 500, 1000, 10000)

    # Start everything in the top band; values matching no interval below
    # (anything above the last edge) keep index 7.
    bands = torch.full_like(turnover_values, 7, dtype=torch.long)
    bands = torch.where(turnover_values <= 0, 0, bands)

    lower_edge = 0
    for band_index, upper_edge in enumerate(upper_edges, start=1):
        inside = (turnover_values > lower_edge) & (
            turnover_values <= upper_edge
        )
        bands = torch.where(inside, band_index, bands)
        lower_edge = upper_edge

    return bands
{accuracy_80_90}/{n_items}, " + f"<80%: {accuracy_below_80}/{n_items}" + ) + return overall_accuracy + + def generate_input_values( + self, turnover_values: Tensor, sic_codes: Tensor + ) -> Tensor: + """Generate input values for firms directly from distributions. + + Input values are drawn from beta distributions that allow for: + - Most firms: inputs 60-95% of turnover (positive VAT liability) + - Some firms: inputs > turnover (negative VAT liability) + + Args: + turnover_values: Array of turnover values (output) + sic_codes: Array of SIC codes (used for sector-specific variations) + + Returns: + Array of input values in £k + """ + logger.info("Generating input values for firms from distributions...") + + n_firms = len(turnover_values) + input_values = torch.zeros_like(turnover_values) + + # Generate input/output ratios from a beta distribution + # Beta(4, 2) gives a distribution centered around 0.67 with range [0, 1] + # We'll scale and shift this to get our desired range + alpha, beta = 4.0, 2.0 + base_ratios = ( + torch.distributions.Beta(alpha, beta) + .sample((n_firms,)) + .to(self.device) + ) + + # Scale to range [0.3, 1.3] to allow both positive and negative VAT liability + # Most values will be in [0.6, 0.95] range + scaled_ratios = 0.3 + base_ratios * 1.0 # Maps [0,1] to [0.3, 1.3] + + # Add sector-specific noise + sector_noise = torch.randn(n_firms, device=self.device) * 0.15 + + # For some sectors, shift the distribution to make negative liability more likely + for i in range(n_firms): + sic = sic_codes[i].item() + + # Add extra variation for certain sectors that tend to have extreme values + if sic in [ + 1, + 3, + 6, + 7, + 9, + 10, + 24, + 30, + 36, + 37, + 49, + 50, + 51, + 60, + 64, + 79, + 84, + ]: + # These sectors often have negative VAT liability + # Add positive bias to input ratio + scaled_ratios[i] += ( + torch.rand(1, device=self.device).item() * 0.3 + ) + elif sic in [11, 12, 69, 70, 78]: + # These sectors often have high VAT liability 
def generate_input_values(
    self, turnover_values: Tensor, sic_codes: Tensor
) -> Tensor:
    """Generate input values for firms directly from distributions.

    Each firm gets an input/output ratio built from:
    - a Beta(4, 2) draw shifted by 0.3, i.e. a ratio in [0.3, 1.3];
    - a sector bias: up to +0.3 for SIC codes in the "high input" list,
      up to -0.2 for SIC codes in the "low input" list;
    - Gaussian noise with standard deviation 0.15.
    The ratio is clamped to [0.1, 1.5] and multiplied by turnover. Ratios
    above 1.0 give inputs larger than turnover (negative VAT liability).
    Firms with turnover <= 0 get an input of 0.

    The sector bias is drawn in one batched call, so the exact random
    stream differs from a per-firm loop while the distribution is the same.

    Args:
        turnover_values: Tensor of turnover values (output)
        sic_codes: Tensor of SIC codes (used for sector-specific variations)

    Returns:
        Tensor of input values in £k, same shape and dtype as
        ``turnover_values``
    """
    logger.info("Generating input values for firms from distributions...")

    n_firms = len(turnover_values)

    # Beta(4, 2) has mean 4 / (4 + 2) ~= 0.67 on [0, 1]. Adding 0.3 maps
    # it to [0.3, 1.3], so the mean ratio before bias and noise is ~0.97.
    # NOTE(review): earlier comments claimed most ratios fall in
    # [0.6, 0.95]; that does not match this parameterisation - confirm
    # which was intended.
    alpha, beta = 4.0, 2.0
    base_ratios = (
        torch.distributions.Beta(alpha, beta)
        .sample((n_firms,))
        .to(self.device)
    )
    scaled_ratios = 0.3 + base_ratios * 1.0

    # Per-firm Gaussian noise applied on top of the sector bias
    sector_noise = torch.randn(n_firms, device=self.device) * 0.15

    sic_on_device = sic_codes.to(self.device)
    # Sectors that often have negative VAT liability (inputs biased up)
    high_input_sics = torch.tensor(
        [1, 3, 6, 7, 9, 10, 24, 30, 36, 37, 49, 50, 51, 60, 64, 79, 84],
        dtype=sic_on_device.dtype,
        device=self.device,
    )
    # Sectors that often have high VAT liability (inputs biased down)
    low_input_sics = torch.tensor(
        [11, 12, 69, 70, 78],
        dtype=sic_on_device.dtype,
        device=self.device,
    )
    in_high = torch.isin(sic_on_device, high_input_sics)
    in_low = torch.isin(sic_on_device, low_input_sics)

    # The two sector lists are disjoint, so one uniform draw per firm can
    # feed either bias; firms in neither list get no bias.
    bias_draws = torch.rand(n_firms, device=self.device)
    scaled_ratios = (
        scaled_ratios
        + bias_draws * 0.3 * in_high
        - bias_draws * 0.2 * in_low
    )

    # Apply ratios with noise
    final_ratios = torch.clamp(scaled_ratios + sector_noise, 0.1, 1.5)

    # Firms with zero or negative turnover have no inputs
    input_values = torch.where(
        turnover_values <= 0,
        torch.zeros_like(turnover_values),
        (turnover_values * final_ratios).to(turnover_values.dtype),
    )

    logger.info(f"Generated input values for {n_firms:,} firms")
    if n_firms > 0:
        # min()/max() raise on empty tensors, so only report when non-empty
        logger.info("Input/output ratio statistics:")
        logger.info(f" Mean: {final_ratios.mean():.2%}")
        logger.info(f" Std: {final_ratios.std():.2%}")
        logger.info(f" Min: {final_ratios.min():.2%}")
        logger.info(f" Max: {final_ratios.max():.2%}")
        logger.info(
            f" Firms with negative VAT liability (input > output): {(final_ratios > 1.0).sum().item():,}"
        )

    return input_values
def assign_employment(
    self, num_firms: int, ons_employment_df: pd.DataFrame
) -> Tensor:
    """Assign employment using ONS distribution.

    The ONS firm counts per employment size band (summed over all rows
    whose ``Description`` does not contain "Total") set the share of
    firms in each band. Head-counts are then sampled inside each band,
    shuffled with NumPy's global RNG, and padded with 1s or trimmed to
    exactly ``num_firms`` entries.

    Args:
        num_firms: Number of firms to assign employment to
        ons_employment_df: ONS employment data with a ``Description``
            column and one column per size band ("0-4", "5-9", ...);
            missing band columns count as zero firms

    Returns:
        float32 tensor of ``num_firms`` employment values on ``self.device``

    Raises:
        ValueError: If the ONS data holds no firms in any band, so no
            distribution can be derived.
    """
    logger.info("Assigning employment using ONS distribution...")

    # (min, max, midpoint) head-count for each ONS employment size band
    band_params = {
        "0-4": (1, 4, 2.5),
        "5-9": (5, 9, 7),
        "10-19": (10, 19, 14.5),
        "20-49": (20, 49, 34.5),
        "50-99": (50, 99, 74.5),
        "100-249": (100, 249, 174.5),
        "250+": (250, 2000, 400),
    }

    # Calculate ONS employment band totals (sector rows only, no totals)
    total_ons_counts = dict.fromkeys(band_params, 0)
    available_bands = [
        band for band in band_params if band in ons_employment_df.columns
    ]
    if available_bands:
        sector_rows = ons_employment_df[
            ~ons_employment_df["Description"].str.contains(
                "Total", na=False
            )
        ]
        for band in available_bands:
            total_ons_counts[band] = int(sector_rows[band].fillna(0).sum())

    total_ons_firms = sum(total_ons_counts.values())
    if total_ons_firms <= 0:
        raise ValueError(
            "ONS employment data contains no firms in any employment "
            "band; cannot derive an employment distribution."
        )

    employment_values = []
    for band, (min_val, max_val, midpoint) in band_params.items():
        target_count = int(
            round(num_firms * total_ons_counts[band] / total_ons_firms)
        )
        if target_count <= 0:
            continue

        if band == "0-4":
            # Uniform over 1..4 for micro businesses
            values = torch.randint(
                1, 5, (target_count,), device=self.device
            )
        elif band == "250+":
            # Log-normal around the band midpoint for large firms
            log_mean = float(np.log(midpoint))
            values = torch.normal(
                log_mean, 0.8, (target_count,), device=self.device
            ).exp()
            values = torch.clamp(values, min_val, max_val).round()
        else:
            # Right-skewed position inside the band from real
            # Beta(1.5, 3) samples. The earlier u**0.5 * (1 - u)**2
            # expression is that density's kernel evaluated at a uniform
            # draw; it never exceeds ~0.29, so values stayed in the
            # bottom third of every band.
            band_position = (
                torch.distributions.Beta(1.5, 3.0)
                .sample((target_count,))
                .to(self.device)
            )
            values = (min_val + band_position * (max_val - min_val)).round()

        employment_values.extend(values.cpu().tolist())

    # Shuffle and pad/trim to exact size (per-band rounding can leave the
    # total slightly off num_firms)
    np.random.shuffle(employment_values)
    if len(employment_values) < num_firms:
        employment_values.extend(
            [1] * (num_firms - len(employment_values))
        )
    elif len(employment_values) > num_firms:
        employment_values = employment_values[:num_firms]

    employment_array = torch.tensor(
        employment_values, dtype=torch.float32, device=self.device
    )

    logger.info(f"Assigned employment to {num_firms:,} firms")

    return employment_array
def validate_comprehensive_accuracy(
    self,
    synthetic_df: pd.DataFrame,
    hmrc_target_bands: Dict[str, int],
    ons_total_target: int,
    ons_employment_df: pd.DataFrame,
    hmrc_sector_df: pd.DataFrame,
    vat_liability_df: pd.DataFrame,
    vat_liability_band_df: pd.DataFrame,
) -> Tuple[float, float, float, float, float, float]:
    """Validate synthetic data against official data sources.

    Prints a report comparing weighted synthetic totals with the targets
    and returns one accuracy figure per validation area. ``synthetic_df``
    is only read: all derived quantities (turnover band, employment band,
    numeric SIC code, VAT liability) are held in local Series, so no
    columns are added to or overwritten in the caller's frame.

    Args:
        synthetic_df: Generated synthetic data with VAT flags
        hmrc_target_bands: HMRC VAT-registered firm targets by turnover band
        ons_total_target: ONS total firm count target
        ons_employment_df: ONS employment data for validation
        hmrc_sector_df: HMRC sector data for validation
        vat_liability_df: VAT liability by sector, read from the
            "2023-24" column (treated as £ millions)
        vat_liability_band_df: VAT liability by turnover band; the last
            row is used (treated as £ millions)

    Returns:
        Tuple of (hmrc_accuracy, ons_population_accuracy,
        employment_accuracy, sector_accuracy,
        vat_liability_sector_accuracy, vat_liability_band_accuracy)
    """
    logger.info(
        "Validating synthetic data against official data sources..."
    )

    # (inclusive upper bound in £k, band name); above the last bound is
    # "Greater_than_£10m"
    turnover_band_edges = (
        (0, "Negative_or_Zero"),
        (85, "£1_to_Threshold"),
        (150, "£Threshold_to_£150k"),
        (300, "£150k_to_£300k"),
        (500, "£300k_to_£500k"),
        (1000, "£500k_to_£1m"),
        (10000, "£1m_to_£10m"),
    )
    # (inclusive upper bound on head-count, band name); above is "250+"
    employment_band_edges = (
        (4, "0-4"),
        (9, "5-9"),
        (19, "10-19"),
        (49, "20-49"),
        (99, "50-99"),
        (249, "100-249"),
    )

    def map_to_hmrc_band(turnover_k):
        """Return the HMRC turnover band name for a turnover in £k."""
        for upper, name in turnover_band_edges:
            if turnover_k <= upper:
                return name
        return "Greater_than_£10m"

    def map_employment_to_band(employment):
        """Return the ONS employment band name for a head-count."""
        for upper, name in employment_band_edges:
            if employment <= upper:
                return name
        return "250+"

    def count_accuracy(actual, target):
        """Relative accuracy of a count; a zero target only matches zero."""
        if target > 0:
            return 1 - abs(actual - target) / target
        return 1.0 if actual == 0 else 0.0

    def liability_accuracy(synthetic_millions, target_millions):
        """Accuracy of a signed £m liability against its target."""
        if abs(target_millions) <= 0.1:
            # Near-zero target: only a near-zero synthetic value matches
            return 1.0 if abs(synthetic_millions) < 1 else 0.0
        gap = abs(synthetic_millions - target_millions)
        same_sign = (target_millions < 0 and synthetic_millions < 0) or (
            target_millions > 0 and synthetic_millions > 0
        )
        if same_sign:
            return 1 - min(gap / abs(target_millions), 1.0)
        # Sign mismatch - poor accuracy
        return max(0, 1 - gap / max(abs(target_millions), 1))

    def status_marker(accuracy, good, fair):
        """Pick the report marker for an accuracy value."""
        return "✓" if accuracy > good else "⚠" if accuracy > fair else "✗"

    # Derived per-firm quantities, kept out of synthetic_df on purpose
    weight = synthetic_df["weight"]
    # `== True` kept deliberately: it is an element-wise comparison that
    # also works when the column is not of bool dtype.
    is_vat_registered = synthetic_df["vat_registered"] == True  # noqa: E712
    hmrc_band = synthetic_df["annual_turnover_k"].apply(map_to_hmrc_band)
    employment_band = synthetic_df["employment"].apply(
        map_employment_to_band
    )
    sic_numeric = synthetic_df["sic_code"].astype(int)
    weighted_liability_k = (
        synthetic_df["annual_turnover_k"] - synthetic_df["annual_input_k"]
    ) * weight

    all_bands = weight.groupby(hmrc_band).sum().round().astype(int)

    # === VAT REGISTRATION VALIDATION ===
    vat_registered_count = weight[is_vat_registered].sum()
    hmrc_total_vat = sum(hmrc_target_bands.values())
    vat_accuracy = count_accuracy(vat_registered_count, hmrc_total_vat)

    self._print_validation_section("VAT REGISTRATION VALIDATION", 80)
    print(f"VAT Registered (weighted): {vat_registered_count:,.0f}")
    print(f"HMRC Total VAT firms: {hmrc_total_vat:,}")
    print(
        f"Difference: {vat_registered_count - hmrc_total_vat:+,.0f}"
    )
    print(f"VAT Registration Accuracy: {vat_accuracy:.1%}")

    # === TURNOVER BAND VALIDATION ===
    self._print_validation_section("TURNOVER BAND VALIDATION", 80)
    print(
        f"{'Band':>25} {'Synthetic':>12} {'Target':>12} {'Source':>8} {'Accuracy':>10}"
    )
    print("-" * 75)

    hmrc_band_accuracies = []
    for band_name, target_count in hmrc_target_bands.items():
        synthetic_count = all_bands.get(band_name, 0)
        accuracy = count_accuracy(synthetic_count, target_count)

        # £1_to_Threshold is ONS-based, not an HMRC target, so it is
        # reported but excluded from the HMRC average
        if band_name == "£1_to_Threshold":
            status = "○"
            source = "ONS-based"
        else:
            hmrc_band_accuracies.append(accuracy)
            status = status_marker(accuracy, 0.90, 0.80)
            source = "HMRC"

        print(
            f" {status} {band_name:>22}: {synthetic_count:>10,} vs {target_count:>10,} {source:>8} ({accuracy:>6.1%})"
        )

    hmrc_accuracy = (
        np.mean(hmrc_band_accuracies) if hmrc_band_accuracies else 0.0
    )
    print("-" * 75)
    print(f"HMRC CALIBRATION ACCURACY: {hmrc_accuracy:.1%}")

    # === ONS POPULATION VALIDATION ===
    total_synthetic_weighted = weight.sum()
    ons_population_accuracy = count_accuracy(
        total_synthetic_weighted, ons_total_target
    )

    self._print_validation_section("ONS POPULATION VALIDATION")
    print(f"Synthetic Total: {total_synthetic_weighted:,.0f}")
    print(f"ONS Target: {ons_total_target:,}")
    print(
        f"Difference: {total_synthetic_weighted - ons_total_target:+,.0f}"
    )
    print(f"ONS Accuracy: {ons_population_accuracy:.1%}")

    # === EMPLOYMENT BAND VALIDATION ===
    employment_bands = [name for _, name in employment_band_edges]
    employment_bands.append("250+")

    # ONS targets: sum of sector rows (no totals); missing bands count 0
    ons_employment_targets = dict.fromkeys(employment_bands, 0)
    available_bands = [
        band
        for band in employment_bands
        if band in ons_employment_df.columns
    ]
    if available_bands:
        ons_sector_rows = ons_employment_df[
            ~ons_employment_df["Description"].str.contains(
                "Total", na=False
            )
        ]
        for band in available_bands:
            ons_employment_targets[band] = (
                ons_sector_rows[band].fillna(0).sum()
            )

    synthetic_employment_counts = (
        weight.groupby(employment_band).sum().round().astype(int)
    )

    self._print_validation_section("EMPLOYMENT BAND VALIDATION")
    print(
        f"{'Band':>8} {'Synthetic':>10} {'ONS Target':>11} {'Accuracy':>10}"
    )
    print("-" * 65)

    employment_accuracies = []
    for band in employment_bands:
        ons_target = ons_employment_targets.get(band, 0)
        synthetic_count = synthetic_employment_counts.get(band, 0)
        accuracy = count_accuracy(synthetic_count, ons_target)
        employment_accuracies.append(accuracy)

        status = status_marker(accuracy, 0.90, 0.80)
        print(
            f" {status} {band:>6}: {synthetic_count:>8,} vs {ons_target:>9,} ({accuracy:>6.1%})"
        )

    print("-" * 65)
    employment_accuracy = self._print_accuracy_breakdown(
        employment_accuracies, len(employment_bands), "EMPLOYMENT"
    )

    # === SECTOR VALIDATION (VAT-registered firms only) ===
    sector_rows = hmrc_sector_df[
        hmrc_sector_df["Trade_Sector"] != "Total"
    ].copy()

    synthetic_vat_sector_counts = (
        weight[is_vat_registered]
        .groupby(sic_numeric[is_vat_registered])
        .sum()
        .round()
        .astype(int)
    )

    self._print_validation_section(
        "SECTOR VALIDATION (VAT-registered firms)"
    )
    print(
        f"{'SIC':>5} {'Synthetic VAT':>12} {'HMRC Target':>12} {'Accuracy':>10}"
    )
    print("-" * 65)

    sector_accuracies = []
    for _, sector_row in sector_rows.iterrows():
        sic_code = int(sector_row["Trade_Sector"])
        hmrc_target = sector_row["2023-24"]

        synthetic_vat_count = synthetic_vat_sector_counts.get(sic_code, 0)
        accuracy = count_accuracy(synthetic_vat_count, hmrc_target)
        sector_accuracies.append(accuracy)

        status = status_marker(accuracy, 0.90, 0.80)
        print(
            f" {status} {sic_code:>3}: {synthetic_vat_count:>10,} vs {hmrc_target:>10,} ({accuracy:>6.1%})"
        )

    print("-" * 65)
    sector_accuracy = self._print_accuracy_breakdown(
        sector_accuracies, len(sector_accuracies), "SECTOR"
    )

    # === VAT LIABILITY VALIDATION ===
    # Sector rows only (excluding total)
    vat_liability_rows = vat_liability_df[
        vat_liability_df["Trade_Sector"] != "Total"
    ].copy()

    # Weighted VAT liability (turnover - input) per sector, all firms
    synthetic_vat_liability = weighted_liability_k.groupby(
        sic_numeric
    ).sum()

    self._print_validation_section("VAT LIABILITY VALIDATION (by sector)")
    print(
        f"{'SIC':>5} {'Synthetic (£m)':>14} {'Target (£m)':>12} {'Accuracy':>10}"
    )
    print("-" * 65)

    vat_liability_accuracies = []
    for _, vat_row in vat_liability_rows.iterrows():
        sic_code = int(vat_row["Trade_Sector"])
        target_millions = vat_row["2023-24"]  # in millions £

        # £k -> £m so the synthetic figure is comparable with the target
        synthetic_millions = synthetic_vat_liability.get(sic_code, 0) / 1000

        accuracy = liability_accuracy(synthetic_millions, target_millions)
        vat_liability_accuracies.append(accuracy)

        status = status_marker(accuracy, 0.70, 0.50)
        print(
            f" {status} {sic_code:>3}: {synthetic_millions:>12.1f} vs {target_millions:>10.1f} ({accuracy:>6.1%})"
        )

    print("-" * 65)
    vat_liability_sector_accuracy = self._print_accuracy_breakdown(
        vat_liability_accuracies,
        len(vat_liability_accuracies),
        "VAT LIABILITY BY SECTOR",
    )

    # === VAT LIABILITY BY TURNOVER BAND VALIDATION ===
    # Targets come from the last row; Negative_or_Zero is not validated
    vat_liability_band_latest = vat_liability_band_df.iloc[-1]
    vat_liability_band_targets = {
        band_name: vat_liability_band_latest[band_name]
        for band_name in (
            "£1_to_Threshold",
            "£Threshold_to_£150k",
            "£150k_to_£300k",
            "£300k_to_£500k",
            "£500k_to_£1m",
            "£1m_to_£10m",
            "Greater_than_£10m",
        )
    }

    # Weighted VAT liability per turnover band, VAT-registered firms only
    synthetic_vat_band_liability = (
        weighted_liability_k[is_vat_registered]
        .groupby(hmrc_band[is_vat_registered])
        .sum()
    )

    self._print_validation_section(
        "VAT LIABILITY BY TURNOVER BAND VALIDATION"
    )
    print(
        f"{'Band':>25} {'Synthetic (£m)':>14} {'Target (£m)':>12} {'Accuracy':>10}"
    )
    print("-" * 75)

    vat_liability_band_accuracies = []
    for band_name, target_millions in vat_liability_band_targets.items():
        synthetic_millions = (
            synthetic_vat_band_liability.get(band_name, 0) / 1000
        )

        accuracy = liability_accuracy(synthetic_millions, target_millions)
        vat_liability_band_accuracies.append(accuracy)

        status = status_marker(accuracy, 0.70, 0.50)
        print(
            f" {status} {band_name:>22}: {synthetic_millions:>12.1f} vs {target_millions:>10.1f} ({accuracy:>6.1%})"
        )

    print("-" * 75)
    vat_liability_band_accuracy = self._print_accuracy_breakdown(
        vat_liability_band_accuracies,
        len(vat_liability_band_accuracies),
        "VAT LIABILITY BY BAND",
    )

    # === FINAL SUMMARY ===
    overall_accuracy = (
        hmrc_accuracy
        + ons_population_accuracy
        + employment_accuracy
        + sector_accuracy
        + vat_liability_sector_accuracy
        + vat_liability_band_accuracy
    ) / 6

    self._print_validation_section("CALIBRATION SUMMARY", 80)
    print(f"HMRC Turnover Bands: {hmrc_accuracy:.1%}")
    print(f"ONS Population: {ons_population_accuracy:.1%}")
    print(f"Employment Bands: {employment_accuracy:.1%}")
    print(f"Sector Distribution: {sector_accuracy:.1%}")
    print(f"VAT Liability by Sector: {vat_liability_sector_accuracy:.1%}")
    print(f"VAT Liability by Band: {vat_liability_band_accuracy:.1%}")
    print(f"Overall Accuracy: {overall_accuracy:.1%}")
    print(f"Total Population: {total_synthetic_weighted:,.0f} firms")

    return (
        hmrc_accuracy,
        ons_population_accuracy,
        employment_accuracy,
        sector_accuracy,
        vat_liability_sector_accuracy,
        vat_liability_band_accuracy,
    )
def generate_synthetic_firms(self) -> pd.DataFrame:
    """Run the full pipeline and return the synthetic firm population.

    Loads the source tables, generates base firms and their inputs,
    optimizes calibration weights against the target matrix, applies the
    final calibration, assigns employment and VAT registration flags, and
    validates the assembled frame before returning it.

    Returns:
        DataFrame with columns sic_code (5-character zero-padded string),
        annual_turnover_k, annual_input_k, vat_liability_k (turnover
        minus input), employment, weight and vat_registered
    """
    logger.info("Starting synthetic firm generation...")

    # Source tables and the ONS total firm count
    loaded = self.load_data()
    (
        ons_df,
        ons_employment_df,
        hmrc_turnover_df,
        hmrc_sector_df,
        vat_liability_df,
        vat_liability_band_df,
        ons_total,
    ) = loaded

    # HMRC targets (VAT-registered firms only) come from the last row
    latest_hmrc_row = hmrc_turnover_df.iloc[-1]
    band_names = (
        "Negative_or_Zero",
        "£1_to_Threshold",
        "£Threshold_to_£150k",
        "£150k_to_£300k",
        "£300k_to_£500k",
        "£500k_to_£1m",
        "£1m_to_£10m",
        "Greater_than_£10m",
    )
    hmrc_bands = {name: latest_hmrc_row[name] for name in band_names}

    hmrc_vat_total = sum(hmrc_bands.values())
    logger.info("Target populations:")
    logger.info(
        f" ONS total firms: {ons_total:,} (includes all businesses)"
    )
    logger.info(
        f" HMRC VAT firms: {hmrc_vat_total:,} (VAT-registered only)"
    )

    # Base firms from the ONS structure, then their input values
    base_sic_codes, base_turnover = self.generate_base_firms(ons_df)
    base_input = self.generate_input_values(base_turnover, base_sic_codes)

    # Target matrix for the multi-objective weight optimization
    target_matrix, target_values = self.create_comprehensive_target_matrix(
        base_turnover,
        base_sic_codes,
        base_input,
        hmrc_bands,
        hmrc_sector_df,
        ons_employment_df,
        vat_liability_df,
        vat_liability_band_df,
        ons_total,
    )
    optimized_weights = self.optimize_weights(target_matrix, target_values)

    # Final calibration (adds the zero-turnover firms)
    calibrated = self.apply_final_calibration(
        base_sic_codes,
        base_turnover,
        base_input,
        optimized_weights,
        hmrc_bands,
    )
    final_sic_codes, final_turnover, final_input, final_weights = calibrated

    # Employment and VAT registration flags for the final population
    employment_values = self.assign_employment(
        len(final_sic_codes), ons_employment_df
    )
    vat_flags = self.assign_vat_registration_flags(
        final_turnover, hmrc_bands
    )

    logger.info("Converting to final DataFrame...")
    sic_array = final_sic_codes.cpu().numpy().astype(int)
    turnover_array = final_turnover.cpu().numpy()
    input_array = final_input.cpu().numpy()

    columns = {
        "sic_code": [str(code).zfill(5) for code in sic_array],
        "annual_turnover_k": turnover_array,
        "annual_input_k": input_array,
        # VAT liability is turnover minus input
        "vat_liability_k": turnover_array - input_array,
        "employment": employment_values.cpu().numpy().astype(int),
        "weight": final_weights.cpu().numpy(),
        "vat_registered": vat_flags.cpu().numpy().astype(bool),
    }
    synthetic_df = pd.DataFrame(columns)

    weighted_population = synthetic_df["weight"].sum()
    logger.info("Generated firm dataset:")
    logger.info(f" Total firms: {len(synthetic_df):,}")
    logger.info(f" Weighted population: {weighted_population:,.0f}")

    # Validation against all data sources
    self.validate_comprehensive_accuracy(
        synthetic_df,
        hmrc_bands,
        ons_total,
        ons_employment_df,
        hmrc_sector_df,
        vat_liability_df,
        vat_liability_band_df,
    )

    return synthetic_df
def main():
    """Configure logging, generate the synthetic firms and log a summary."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    logger.info("UK BUSINESS POPULATION SYNTHETIC DATA GENERATION")
    logger.info(
        "Generating synthetic firm records using PyTorch for efficient processing"
    )

    # CPU is used for compatibility
    generator = SyntheticFirmGenerator(device="cpu")
    synthetic_df = generator.generate_synthetic_firms()

    # Results are only logged; nothing is written to disk here
    size_mb = synthetic_df.memory_usage(deep=True).sum() / 1024 / 1024
    column_names = list(synthetic_df.columns)
    logger.info(f"Generated {len(synthetic_df):,} synthetic firms")
    logger.info(f"Data size: {size_mb:.1f} MB")
    logger.info(f"Columns: {column_names}")

    logger.info("Synthetic data generation complete!")


if __name__ == "__main__":
    main()