From d36df670278a7d8c2122e32c793ab2ee51ebd528 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Wed, 7 Aug 2024 17:30:32 -0800
Subject: [PATCH 01/34] add apache cassandra astra as source

---
 examples/apachecassandra_to_pinecone.yaml | 24 +++++++++
 vector_etl/source_mods/__init__.py        |  3 ++
 vector_etl/source_mods/airtable_loader.py |  0
 .../apache_cassandra_astra_loader.py      | 49 +++++++++++++++++++
 vector_etl/source_mods/base.py            |  3 ++
 5 files changed, 79 insertions(+)
 create mode 100644 examples/apachecassandra_to_pinecone.yaml
 create mode 100644 vector_etl/source_mods/airtable_loader.py
 create mode 100644 vector_etl/source_mods/apache_cassandra_astra_loader.py

diff --git a/examples/apachecassandra_to_pinecone.yaml b/examples/apachecassandra_to_pinecone.yaml
new file mode 100644
index 0000000..6b93fd0
--- /dev/null
+++ b/examples/apachecassandra_to_pinecone.yaml
@@ -0,0 +1,24 @@
+source:
+  source_data_type: "Apache Cassandra"
+  db_type: "cassandra_astra"
+  clientId: ""
+  secret: ""
+  keyspace: "sales"
+  secure_connect_bundle: "secure-connect-contextdata.zip"
+  query: "SELECT * FROM chipotle_stores LIMIT 10"
+
+embedding:
+  embedding_model: "OpenAI"
+  api_key: ""
+  model_name: "text-embedding-ada-002"
+
+target:
+  target_database: "Pinecone"
+  pinecone_api_key: ""
+  index_name: ""
+  dimension: 1536 #[Optional] Only required if creating a new index
+  metric: "cosine" #[Optional] Only required if creating a new index
+  cloud: "aws" #[Optional] Only required if creating a new index
+  region: "us-east-1" #[Optional] Only required if creating a new index
+
diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 035255f..6f4cf62 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -8,6 +8,7 @@ from .zendesk_loader import ZendeskSource
 from .google_drive import GoogleDriveSource
 from .google_cloud_storage import GoogleCloudStorageSource
+from .apache_cassandra_astra_loader import ApacheCassandraAstraSource
 from .local_file import LocalFileSource
 
 def get_source_class(config):
@@ -30,5 +31,7 @@ def get_source_class(config):
         return GoogleDriveSource(config)
     elif source_type == 'Google Cloud Storage':
         return GoogleCloudStorageSource(config)
+    elif source_type == 'Apache Cassandra':
+        return ApacheCassandraAstraSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py
new file mode 100644
index 0000000..e69de29
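Note: a minimal sketch of how the new registry branch is reached once this patch is applied. The config keys mirror examples/apachecassandra_to_pinecone.yaml above; the credential values are placeholders, not working settings.

    from vector_etl.source_mods import get_source_class

    config = {
        "source_data_type": "Apache Cassandra",   # routes to ApacheCassandraAstraSource
        "db_type": "cassandra_astra",
        "clientId": "<astra-client-id>",
        "secret": "<astra-client-secret>",
        "keyspace": "sales",
        "secure_connect_bundle": "secure-connect-contextdata.zip",
        "query": "SELECT * FROM chipotle_stores LIMIT 10",
    }
    source = get_source_class(config)  # the loader connects in __init__
    df = source.fetch_data()           # pandas DataFrame of the query results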
diff --git a/vector_etl/source_mods/apache_cassandra_astra_loader.py b/vector_etl/source_mods/apache_cassandra_astra_loader.py
new file mode 100644
index 0000000..f4c394c
--- /dev/null
+++ b/vector_etl/source_mods/apache_cassandra_astra_loader.py
@@ -0,0 +1,49 @@
+import pandas as pd
+from cassandra.cluster import Cluster
+import logging
+from .base import BaseSource
+from cassandra.auth import PlainTextAuthProvider
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class ApacheCassandraAstraSource(BaseSource):
+    def __init__(self, config):
+        self.config = config
+        self.cluster = None
+        self.auth_provider = None
+        self.session = None
+        self.cloud_config = None
+        self.keyspace = self.config['keyspace']
+        self.connect()  # Initialize connection here
+
+    def connect(self):
+        if self.config["db_type"] == 'cassandra_astra':
+            self.cloud_config = {'secure_connect_bundle': self.config['secure_connect_bundle']}
+            self.auth_provider = PlainTextAuthProvider(self.config['clientId'], self.config['secret'])
+            self.cluster = Cluster(cloud=self.cloud_config, auth_provider=self.auth_provider, protocol_version=3)
+            self.session = self.cluster.connect(self.keyspace)
+        else:
+            raise ValueError("Invalid database type")
+
+    def fetch_data(self):
+        if not self.session:
+            raise Exception("Session is not initialized. Ensure you call connect() first.")
+
+        query = self.config.get("query", "")
+        prepared_statement = self.session.prepare(query)
+
+        try:
+            db_data = self.session.execute(prepared_statement)
+            if db_data:
+                # Convert to Pandas DataFrame
+                df = pd.DataFrame(list(db_data))
+                return df
+            else:
+                logger.error("No data returned from query")
+                return None
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+            return None
diff --git a/vector_etl/source_mods/base.py b/vector_etl/source_mods/base.py
index 1041dc3..01c7683 100644
--- a/vector_etl/source_mods/base.py
+++ b/vector_etl/source_mods/base.py
@@ -8,3 +8,6 @@ def connect(self):
     @abstractmethod
     def fetch_data(self):
         pass
+
+
+

From 5d47ca5be99e051747f983cb18946b3f4b06a79a Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Fri, 9 Aug 2024 03:06:48 -0800
Subject: [PATCH 02/34] add bigquery and airtable

---
 vector_etl/source_mods/__init__.py        |  6 ++++
 vector_etl/source_mods/airtable_loader.py | 38 ++++++++++++++++++++
 vector_etl/source_mods/google_bigquery.py | 39 +++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 vector_etl/source_mods/google_bigquery.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 6f4cf62..0f93ad3 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -10,6 +10,8 @@ from .google_cloud_storage import GoogleCloudStorageSource
 from .apache_cassandra_astra_loader import ApacheCassandraAstraSource
 from .local_file import LocalFileSource
+from .airtable_loader import AirTableSource
+from .google_bigquery import GoogleBigQuerySource
 
 def get_source_class(config):
     source_type = config['source_data_type']
@@ -33,5 +35,9 @@ def get_source_class(config):
         return GoogleCloudStorageSource(config)
     elif source_type == 'Apache Cassandra':
         return ApacheCassandraAstraSource(config)
+    elif source_type == 'AirTable':
+        return AirTableSource(config)
+    elif source_type == 'Google BigQuery':
+        return GoogleBigQuerySource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py
index e69de29..e9bbfe9 100644
--- a/vector_etl/source_mods/airtable_loader.py
+++ b/vector_etl/source_mods/airtable_loader.py
@@ -0,0 +1,38 @@
+import requests
+from base import BaseSource
+from pprint import pprint
+import pandas as pd
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class AirTableSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.url = f"{self.config['url']}{self.config['baseId']}/{self.config['tableIdOrName']}"
+        self.auth_token = config['auth_token']
+
+    def connect(self):
+        headers = {
+            "Authorization": f"Bearer {self.auth_token}"
+        }
+        try:
+            response = requests.get(self.url,headers=headers)
+            data = response.json()['records']
+            return data
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+            return None
+
+    def fetch_data(self):
+        records = self.connect()
+        df_data = [data['fields'] for data in records]
+        airtable_df = pd.DataFrame(df_data)
+
+        return airtable_df
+
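Note: the loader above reads a single response page. Airtable's list-records endpoint caps each response at 100 records and returns an offset token while more pages remain, so a paginated variant would loop on that token — a sketch under that assumption, not part of the patch:

    import requests

    def fetch_all_records(url, auth_token):
        # Follow Airtable's offset-based pagination until no token is returned.
        headers = {"Authorization": f"Bearer {auth_token}"}
        records, offset = [], None
        while True:
            params = {"offset": offset} if offset else {}
            payload = requests.get(url, headers=headers, params=params).json()
            records.extend(payload.get("records", []))
            offset = payload.get("offset")
            if not offset:
                return records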
diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py
new file mode 100644
index 0000000..3b98e43
--- /dev/null
+++ b/vector_etl/source_mods/google_bigquery.py
@@ -0,0 +1,39 @@
+import os
+from google.cloud import bigquery
+from base import BaseSource
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class GoogleBigQuerySource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.google_application_credentials = config['GOOGLE_APPLICATION_CREDENTIALS']
+        self.client = None
+        self.connect()
+
+
+    def connect(self):
+        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.google_application_credentials
+        self.client = bigquery.Client()
+
+    def fetch_data(self):
+        if self.client:
+            try:
+                query_job = self.client.query(f"""{self.config.get("query"," ")}""")
+                if query_job:
+                    dfrows = query_job.result().to_dataframe()
+                    return dfrows
+                else:
+                    logger.error("No data returned from query")
+                    return None
+            except Exception as e:
+                logger.error(f"An error occurred: {e}")
+                return None
+
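Note: for reference, the query path fetch_data relies on, shown standalone. to_dataframe() needs the pandas extras of google-cloud-bigquery installed; the query and credentials path here are illustrative only.

    import os
    from google.cloud import bigquery

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-account.json"  # illustrative path
    client = bigquery.Client()
    df = client.query("SELECT 1 AS x").result().to_dataframe()  # pandas DataFrame
    print(df)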
From a2a6663cb7d685d8f2e43b8f48770a5f71737c59 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sat, 10 Aug 2024 11:29:32 -0800
Subject: [PATCH 03/34] add test case for all sources

---
 examples/airtable_to_pincone.yaml         |  22 +++++
 examples/google_bigquery_to_pincone.yaml  |  20 ++++
 tests/test_source_mods.py                 | 113 ++++++++++++++++++++++
 vector_etl/source_mods/google_bigquery.py |   6 +-
 vector_etl/target_mods/__init__.py        |   1 +
 5 files changed, 159 insertions(+), 3 deletions(-)
 create mode 100644 examples/airtable_to_pincone.yaml
 create mode 100644 examples/google_bigquery_to_pincone.yaml

diff --git a/examples/airtable_to_pincone.yaml b/examples/airtable_to_pincone.yaml
new file mode 100644
index 0000000..c47fc21
--- /dev/null
+++ b/examples/airtable_to_pincone.yaml
@@ -0,0 +1,22 @@
+source:
+  source_data_type: "AirTable"
+  url: ""
+  auth_token: ""
+  baseId: "sales"
+  tableIdOrName: "secure-connect-contextdata.zip"
+
+embedding:
+  embedding_model: "OpenAI"
+  api_key: ""
+  model_name: "text-embedding-ada-002"
+
+target:
+  target_database: "Pinecone"
+  pinecone_api_key: ""
+  index_name: ""
+  dimension: 1536 #[Optional] Only required if creating a new index
+  metric: "cosine" #[Optional] Only required if creating a new index
+  cloud: "aws" #[Optional] Only required if creating a new index
+  region: "us-east-1" #[Optional] Only required if creating a new index
+
diff --git a/examples/google_bigquery_to_pincone.yaml b/examples/google_bigquery_to_pincone.yaml
new file mode 100644
index 0000000..3075154
--- /dev/null
+++ b/examples/google_bigquery_to_pincone.yaml
@@ -0,0 +1,20 @@
+source:
+  source_data_type: "Google BigQuery"
+  google_application_credentials: ""
+  query: ""
+
+embedding:
+  embedding_model: "OpenAI"
+  api_key: ""
+  model_name: "text-embedding-ada-002"
+
+target:
+  target_database: "Pinecone"
+  pinecone_api_key: ""
+  index_name: ""
+  dimension: 1536 #[Optional] Only required if creating a new index
+  metric: "cosine" #[Optional] Only required if creating a new index
+  cloud: "aws" #[Optional] Only required if creating a new index
+  region: "us-east-1" #[Optional] Only required if creating a new index
+
diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py
index 103b23e..bb10990 100644
--- a/tests/test_source_mods.py
+++ b/tests/test_source_mods.py
@@ -5,6 +5,9 @@ from vector_etl.source_mods.s3_loader import S3Source
 from vector_etl.source_mods.database_loader import DatabaseSource
 from vector_etl.source_mods.local_file import LocalFileSource
+from vector_etl.source_mods.google_bigquery import GoogleBigQuerySource
+from vector_etl.source_mods.airtable_loader import AirTableSource
+from vector_etl.source_mods.apache_cassandra_astra_loader import ApacheCassandraAstraSource
 
 @pytest.fixture
 def s3_config():
@@ -16,7 +19,41 @@ def s3_config():
         'chunk_size': 1000,
         'chunk_overlap': 200
     }
+
+@pytest.fixture
+def google_bigquery_config():
+    return {
+        "source_data_type": "Google BigQuery",
+        "db_type": "google_bigquery",
+        "GOOGLE_APPLICATION_CREDENTIALS": "",
+        "query": "SELECT * FROM chipotle_stores LIMIT 10"
+    }
+
+@pytest.fixture
+def airtable_config():
+    return {
+        "url": "airtable.com/sales",
+        "baseId": "sales",
+        "auth_token": "673989fhuhefiw0903",
+        "tableIdOrName": "survey"
+    }
+
+@pytest.fixture
+def apache_cassandra_astra_config():
+    return {
+        "source_data_type": "Apache Cassandra",
+        "db_type": "cassandra_astra",
+        "clientId": "",
+        "secret": "",
+        "keyspace": "sales",
+        "secure_connect_bundle": "secure-connect-contextdata.zip",
+        "query": "SELECT * FROM chipotle_stores LIMIT 10",
+    }
+
 @pytest.fixture
 def db_config():
     return {
@@ -99,4 +136,80 @@ def test_local_file_source_read_file(local_file_config):
     source = LocalFileSource(local_file_config)
     file_content = source.read_file('/path/to/test_file.csv')
     assert isinstance(file_content, BytesIO)
+
+
+def test_google_bigquery_connect(google_bigquery_config):
+    with patch('vector_etl.source_mods.google_bigquery.bigquery.Client') as mock_client:
+        source = GoogleBigQuerySource(google_bigquery_config)
+        source.connect()
+        mock_client.assert_called_with()
+
+
+def test_google_bigquery_fetch_data(google_bigquery_config):
+    with patch('vector_etl.source_mods.google_bigquery.bigquery.Client') as mock_client:
+        mock_client.return_value.query.return_value.result.return_value.to_dataframe.return_value = pd.DataFrame()
+        source = GoogleBigQuerySource(google_bigquery_config)
+        df = source.fetch_data()
+        assert isinstance(df, pd.DataFrame)
+
+
+def test_apache_cassandra_astra_connect(apache_cassandra_astra_config):
+    with patch('vector_etl.source_mods.apache_cassandra_astra_loader.Cluster') as mock_cluster:
+        source = ApacheCassandraAstraSource(apache_cassandra_astra_config)
+        mock_cluster.assert_called_once_with(
+            cloud={'secure_connect_bundle': 'secure-connect-contextdata.zip'},
+            auth_provider=source.auth_provider,
+            protocol_version=3
+        )
+
+
+def test_apache_cassandra_astra_fetch_data(apache_cassandra_astra_config):
+    with patch('vector_etl.source_mods.apache_cassandra_astra_loader.Cluster') as mock_cluster:
+        mock_cluster.return_value.connect.return_value.execute.return_value = [{"id": "", "name": ""}]
+        source = ApacheCassandraAstraSource(apache_cassandra_astra_config)
+        df = source.fetch_data()
+        assert isinstance(df, pd.DataFrame)
+
+
+def test_airtable_connect(airtable_config):
+    with patch('vector_etl.source_mods.airtable_loader.requests.get') as mock_get:
+        source = AirTableSource(airtable_config)
+        source.connect()
+        mock_get.assert_called_once_with(
+            source.url,
+            headers={"Authorization": f"Bearer {airtable_config['auth_token']}"}
+        )
+
+
+def test_airtable_fetch_data(airtable_config):
+    with patch('vector_etl.source_mods.airtable_loader.requests.get') as mock_get:
+        mock_get.return_value.json.return_value = {
+            "records": [
+                {"fields": {"Address": "333 Post St", "Name": "Union Square", "Visited": True}}
+            ]
+        }
+        source = AirTableSource(airtable_config)
+        df = source.fetch_data()
+        assert isinstance(df, pd.DataFrame)
+
diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py
index 3b98e43..fb1460a 100644
--- a/vector_etl/source_mods/google_bigquery.py
+++ b/vector_etl/source_mods/google_bigquery.py
@@ -10,14 +10,14 @@ class GoogleBigQuerySource(BaseSource):
     def __init__(self,config):
         self.config = config
-        self.google_application_credentials = config['GOOGLE_APPLICATION_CREDENTIALS']
         self.client = None
         self.connect()
 
 
     def connect(self):
-        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.google_application_credentials
-        self.client = bigquery.Client()
+        if self.config["db_type"] == 'google_bigquery':
+            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.config['GOOGLE_APPLICATION_CREDENTIALS']
+            self.client = bigquery.Client()
 
     def fetch_data(self):
         if self.client:
diff --git a/vector_etl/target_mods/__init__.py b/vector_etl/target_mods/__init__.py
index f0be443..f19dd9b 100644
--- a/vector_etl/target_mods/__init__.py
+++ b/vector_etl/target_mods/__init__.py
@@ -8,6 +8,7 @@ from .mongodb import MongoDBTarget
 from .neo4j import Neo4jTarget
 
+
 def get_target_database(config):
     target_type = config['target_database']
     if target_type == 'Pinecone':

From ce01cd3373e15afe9ab5e95e33e6ec1546729018 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Tue, 20 Aug 2024 04:01:01 -0800
Subject: [PATCH 04/34] removed astra db source

---
 examples/apachecassandra_to_pinecone.yaml | 24 ---------
 tests/test_source_mods.py                 | 36 --------------
 vector_etl/source_mods/__init__.py        |  3 --
 .../apache_cassandra_astra_loader.py      | 49 -------------------
 4 files changed, 112 deletions(-)
 delete mode 100644 examples/apachecassandra_to_pinecone.yaml
 delete mode 100644 vector_etl/source_mods/apache_cassandra_astra_loader.py

diff --git a/examples/apachecassandra_to_pinecone.yaml b/examples/apachecassandra_to_pinecone.yaml
deleted file mode 100644
index 6b93fd0..0000000
--- a/examples/apachecassandra_to_pinecone.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-source:
-  source_data_type: "Apache Cassandra"
-  db_type: "cassandra_astra"
-  clientId: ""
-  secret: ""
-  keyspace: "sales"
-  secure_connect_bundle: "secure-connect-contextdata.zip"
-  query: "SELECT * FROM chipotle_stores LIMIT 10"
-
-embedding:
-  embedding_model: "OpenAI"
-  api_key: ""
-  model_name: "text-embedding-ada-002"
-
-target:
-  target_database: "Pinecone"
-  pinecone_api_key: ""
-  index_name: ""
-  dimension: 1536 #[Optional] Only required if creating a new index
-  metric: "cosine" #[Optional] Only required if creating a new index
-  cloud: "aws" #[Optional] Only required if creating a new index
-  region: "us-east-1" #[Optional] Only required if creating a new index
-
diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py
index bb10990..c83dfe5 100644
--- a/tests/test_source_mods.py
+++ b/tests/test_source_mods.py
@@ -7,7 +7,6 @@ from vector_etl.source_mods.local_file import LocalFileSource
 from vector_etl.source_mods.google_bigquery import GoogleBigQuerySource
 from vector_etl.source_mods.airtable_loader import AirTableSource
-from vector_etl.source_mods.apache_cassandra_astra_loader import ApacheCassandraAstraSource
 
 @pytest.fixture
 def s3_config():
@@ -40,20 +39,6 @@ def airtable_config():
     }
 
-@pytest.fixture
-def apache_cassandra_astra_config():
-    return {
-        "source_data_type": "Apache Cassandra",
-        "db_type": "cassandra_astra",
-        "clientId": "",
-        "secret": "",
-        "keyspace": "sales",
-        "secure_connect_bundle": "secure-connect-contextdata.zip",
-        "query": "SELECT * FROM chipotle_stores LIMIT 10",
-    }
-
 @pytest.fixture
 def db_config():
     return {
@@ -159,27 +144,6 @@ def test_google_bigquery_fetch_data(google_bigquery_config):
 
 
 
-def test_apache_cassandra_astra_connect(apache_cassandra_astra_config):
-    with patch('vector_etl.source_mods.apache_cassandra_astra_loader.Cluster') as mock_cluster:
-        source = ApacheCassandraAstraSource(apache_cassandra_astra_config)
-        mock_cluster.assert_called_once_with(
-            cloud={'secure_connect_bundle': 'secure-connect-contextdata.zip'},
-            auth_provider=source.auth_provider,
-            protocol_version=3
-        )
-
-
-def test_apache_cassandra_astra_fetch_data(apache_cassandra_astra_config):
-    with patch('vector_etl.source_mods.apache_cassandra_astra_loader.Cluster') as mock_cluster:
-        mock_cluster.return_value.connect.return_value.execute.return_value = [{"id": "", "name": ""}]
-        source = ApacheCassandraAstraSource(apache_cassandra_astra_config)
-        df = source.fetch_data()
-        assert isinstance(df, pd.DataFrame)
 
diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 0f93ad3..9dae03a 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -8,7 +8,6 @@ from .zendesk_loader import ZendeskSource
 from .google_drive import GoogleDriveSource
 from .google_cloud_storage import GoogleCloudStorageSource
-from .apache_cassandra_astra_loader import ApacheCassandraAstraSource
 from .local_file import LocalFileSource
 from .airtable_loader import AirTableSource
 from .google_bigquery import GoogleBigQuerySource
@@ -33,8 +32,6 @@ def get_source_class(config):
         return GoogleDriveSource(config)
     elif source_type == 'Google Cloud Storage':
         return GoogleCloudStorageSource(config)
-    elif source_type == 'Apache Cassandra':
-        return ApacheCassandraAstraSource(config)
     elif source_type == 'AirTable':
         return AirTableSource(config)
     elif source_type == 'Google BigQuery':
diff --git a/vector_etl/source_mods/apache_cassandra_astra_loader.py b/vector_etl/source_mods/apache_cassandra_astra_loader.py
deleted file mode 100644
index f4c394c..0000000
--- a/vector_etl/source_mods/apache_cassandra_astra_loader.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import pandas as pd
-from cassandra.cluster import Cluster
-import logging
-from .base import BaseSource
-from cassandra.auth import PlainTextAuthProvider
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class ApacheCassandraAstraSource(BaseSource):
-    def __init__(self, config):
-        self.config = config
-        self.cluster = None
-        self.auth_provider = None
-        self.session = None
-        self.cloud_config = None
-        self.keyspace = self.config['keyspace']
-        self.connect()  # Initialize connection here
-
-    def connect(self):
-        if self.config["db_type"] == 'cassandra_astra':
-            self.cloud_config = {'secure_connect_bundle': self.config['secure_connect_bundle']}
-            self.auth_provider = PlainTextAuthProvider(self.config['clientId'], self.config['secret'])
-            self.cluster = Cluster(cloud=self.cloud_config, auth_provider=self.auth_provider, protocol_version=3)
-            self.session = self.cluster.connect(self.keyspace)
-        else:
-            raise ValueError("Invalid database type")
-
-    def fetch_data(self):
-        if not self.session:
-            raise Exception("Session is not initialized. Ensure you call connect() first.")
Ensure you call connect() first.") - - query = self.config.get("query", "") - prepared_statement = self.session.prepare(query) - - try: - db_data = self.session.execute(prepared_statement) - if db_data: - # Convert to Pandas DataFrame - df = pd.DataFrame(list(db_data)) - return df - else: - logger.error(f"No data returned: {e}") - return None - except Exception as e: - logger.error(f"An error occurred: {e}") - return None - From b4a8e3909630a554bcb794b45d662b8cc053f4c0 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Tue, 20 Aug 2024 13:12:59 -0800 Subject: [PATCH 05/34] updated base module import --- vector_etl/source_mods/airtable_loader.py | 2 +- vector_etl/source_mods/google_bigquery.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py index e9bbfe9..8c4566e 100644 --- a/vector_etl/source_mods/airtable_loader.py +++ b/vector_etl/source_mods/airtable_loader.py @@ -1,5 +1,5 @@ import requests -from base import BaseSource +from .base import BaseSource from pprint import pprint import pandas as pd import logging diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py index fb1460a..affa5f9 100644 --- a/vector_etl/source_mods/google_bigquery.py +++ b/vector_etl/source_mods/google_bigquery.py @@ -1,6 +1,6 @@ import os from google.cloud import bigquery -from base import BaseSource +from .base import BaseSource import logging logging.basicConfig(level=logging.INFO) From f7232337bcdb3580b5f5c045f17f5d826cbae20a Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Wed, 21 Aug 2024 05:07:53 -0800 Subject: [PATCH 06/34] update airtablesource url to default https://api.airtable.com/v0/ --- examples/airtable_to_pincone.yaml | 3 +-- vector_etl/main.py | 1 + vector_etl/source_mods/airtable_loader.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/airtable_to_pincone.yaml b/examples/airtable_to_pincone.yaml index c47fc21..5e95087 100644 --- a/examples/airtable_to_pincone.yaml +++ b/examples/airtable_to_pincone.yaml @@ -1,9 +1,8 @@ source: source_data_type: "AirTable" - url: "" auth_token: "" baseId: "sales" - tableIdOrName: "secure-connect-contextdata.zip" + tableIdOrName: "" embedding: embedding_model: "OpenAI" diff --git a/vector_etl/main.py b/vector_etl/main.py index 983ec51..e201875 100644 --- a/vector_etl/main.py +++ b/vector_etl/main.py @@ -6,6 +6,7 @@ from vector_etl import __version__, run_etl_process from vector_etl.orchestrator import run_etl_process + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py index 8c4566e..0e14b58 100644 --- a/vector_etl/source_mods/airtable_loader.py +++ b/vector_etl/source_mods/airtable_loader.py @@ -11,7 +11,7 @@ class AirTableSource(BaseSource): def __init__(self,config): self.config = config - self.url = f"{self.config['url']}{self.config['baseId']}/{self.config['tableIdOrName']}" + self.url = f"https://api.airtable.com/v0/{self.config['baseId']}/{self.config['tableIdOrName']}" self.auth_token = config['auth_token'] def connect(self): From 939ae2898b037fe05345c82a612a4ef11bd4136f Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 23 Aug 2024 17:48:22 -0800 Subject: [PATCH 07/34] added hubspot source --- vector_etl/source_mods/__init__.py | 3 + vector_etl/source_mods/google_bigquery.py | 13 ++- vector_etl/source_mods/hubspot_loader.py | 124 
From ff45f2f7894d731f8ad44616b3ea25247ba9fd0e Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Fri, 23 Aug 2024 17:48:22 -0800
Subject: [PATCH 07/34] added hubspot source

---
 vector_etl/source_mods/__init__.py        |   3 +
 vector_etl/source_mods/google_bigquery.py |  13 ++-
 vector_etl/source_mods/hubspot_loader.py  | 124 ++++++++++++++++++++++
 3 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 vector_etl/source_mods/hubspot_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 9dae03a..8206443 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -11,6 +11,7 @@ from .google_cloud_storage import GoogleCloudStorageSource
 from .local_file import LocalFileSource
 from .airtable_loader import AirTableSource
 from .google_bigquery import GoogleBigQuerySource
+from .hubspot_loader import HubSpotSource
 
 def get_source_class(config):
     source_type = config['source_data_type']
@@ -36,5 +37,7 @@ def get_source_class(config):
         return AirTableSource(config)
     elif source_type == 'Google BigQuery':
         return GoogleBigQuerySource(config)
+    elif source_type == 'HubSpot':
+        return HubSpotSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py
index affa5f9..3a93a99 100644
--- a/vector_etl/source_mods/google_bigquery.py
+++ b/vector_etl/source_mods/google_bigquery.py
@@ -35,5 +35,14 @@ def fetch_data(self):
 
 
 
-
+config = {"query":"SELECT * FROM bigquery-public-data.america_health_rankings.ahr LIMIT 100",
+          "GOOGLE_APPLICATION_CREDENTIALS":"contextData_bigquery_cred.json",
+          "db_type":'google_bigquery'
+          }
+
+
+data = GoogleBigQuerySource(config)
+print(type(data.fetch_data()))
+print(data.fetch_data())
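Note: the module-level smoke test added above executes on every import of the module. A sketch of the usual guard, reusing the same config keys (the credentials path comes from the patch and is assumed to exist locally):

    if __name__ == "__main__":
        config = {
            "db_type": "google_bigquery",
            "GOOGLE_APPLICATION_CREDENTIALS": "contextData_bigquery_cred.json",
            "query": "SELECT * FROM `bigquery-public-data.america_health_rankings.ahr` LIMIT 100",
        }
        print(GoogleBigQuerySource(config).fetch_data())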
diff --git a/vector_etl/source_mods/hubspot_loader.py b/vector_etl/source_mods/hubspot_loader.py
new file mode 100644
index 0000000..5b7d381
--- /dev/null
+++ b/vector_etl/source_mods/hubspot_loader.py
@@ -0,0 +1,124 @@
+from .base import BaseSource
+import pandas as pd
+import requests
+import logging
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class HubSpotSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.endpoints = None
+        self.access_token = self.config["access_token"]
+
+    def connect(self,url):
+        headers = {
+            "authorization":f"Bearer {self.access_token}"
+        }
+        try:
+            response = requests.get(url=url,headers=headers)
+            logger.info(f"Status {response.status_code}")
+            return response.json()
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+
+    def fetch_data(self):
+
+        if self.config['crm_object'] == "crm.companies":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/companies?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Companies \n")
+
+        elif self.config['crm_object'] == "crm.contacts":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/contacts?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Contacts \n")
+
+        elif self.config['crm_object'] == "crm.tickets":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/tickets?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Tickets \n")
+
+        elif self.config['crm_object'] == "crm.deals":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/deals?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Deals \n")
+
+        elif self.config['crm_object'] == "crm.products":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/products?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"products \n")
+
+        elif self.config['crm_object'] == "crm.invoices":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/invoices?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"invoices \n")
+
+        elif self.config['crm_object'] == "crm.carts":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/carts?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Carts \n")
+
+        elif self.config['crm_object'] == "crm.tasks":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/tasks?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Tasks \n")
+
+        elif self.config['crm_object'] == "crm.payments":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/commerce_payments?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Payments \n")
+
+        elif self.config['crm_object'] == "crm.orders":
+            self.endpoints = f"https://api.hubapi.com/crm/v3/objects/orders?limit={self.config['limit']}&archived={self.config['archive']}"
+            logger.info(f"Orders \n")
+
+        else:
+            raise ValueError(f"Unsupported CRM object type: {self.config['crm_object']}")
+
+        response = self.connect(self.endpoints)['results']
+        results = [item['properties'] for item in response]
+        df = pd.DataFrame(results)
+        logger.info(f" data \n {df}")
+
+        return df
+
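Note: each branch above only swaps the object path inside the same URL template, so the chain can be collapsed into a lookup table — a sketch (paths taken from the patch; helper name illustrative):

    CRM_OBJECT_PATHS = {
        "crm.companies": "companies",
        "crm.contacts": "contacts",
        "crm.tickets": "tickets",
        "crm.deals": "deals",
        "crm.products": "products",
        "crm.invoices": "invoices",
        "crm.carts": "carts",
        "crm.tasks": "tasks",
        "crm.payments": "commerce_payments",
        "crm.orders": "orders",
    }

    def build_endpoint(crm_object, limit, archive):
        try:
            path = CRM_OBJECT_PATHS[crm_object]
        except KeyError:
            raise ValueError(f"Unsupported CRM object type: {crm_object}")
        return f"https://api.hubapi.com/crm/v3/objects/{path}?limit={limit}&archived={archive}"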
From fb47e0255df9fd495b52383b7688840b5e5e85aa Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sat, 24 Aug 2024 13:25:42 -0800
Subject: [PATCH 08/34] added zoho crm source

---
 vector_etl/source_mods/__init__.py        |   4 +-
 vector_etl/source_mods/zoho_crm_loader.py | 153 ++++++++++++++++++++++
 2 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 vector_etl/source_mods/zoho_crm_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 8206443..3014119 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -12,7 +12,7 @@ from .airtable_loader import AirTableSource
 from .google_bigquery import GoogleBigQuerySource
 from .hubspot_loader import HubSpotSource
-
+from .zoho_crm_loader import ZohoCrmSource
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -39,5 +39,7 @@ def get_source_class(config):
         return GoogleBigQuerySource(config)
     elif source_type == 'HubSpot':
         return HubSpotSource(config)
+    elif source_type == 'Zoho Crm':
+        return ZohoCrmSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/zoho_crm_loader.py b/vector_etl/source_mods/zoho_crm_loader.py
new file mode 100644
index 0000000..a768066
--- /dev/null
+++ b/vector_etl/source_mods/zoho_crm_loader.py
@@ -0,0 +1,153 @@
+from base import BaseSource
+import pandas as pd
+import requests
+import logging
+from pprint import pprint
+import os
+import json
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class ZohoCrmSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.token = None
+        self.url = None
+        self.grant_type = self.config['grant_type']
+        self.client_id = self.config['client_id']
+        self.client_secret = self.config['client_secret']
+        self.code = self.config['code']
+        self.accounts_url = self.config['accounts_url']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self.flatten_dict(v, new_key, sep=sep).items())
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self):
+
+        data = {
+            "grant_type":self.grant_type,
+            "client_id": self.client_id,
+            "client_secret": self.client_secret,
+            "code": self.code
+        }
+        try:
+            if os.path.exists("token.json"):
+                with open("token.json",'r') as token_file:
+                    token_data = json.load(token_file)
+                    self.token = token_data.get("access_token")
+                    return self.token
+            else:
+                response = requests.post(url=self.accounts_url, data=data)
+                logger.info(f"Status {response.status_code}")
+                with open("token.json", 'w') as token_file:
+                    json.dump({"access_token": response.json()["access_token"]}, token_file)
+
+                logger.info("New token fetched and saved.")
+                tokens = response.json()["access_token"]
+                return tokens
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"connection Error {http_err}")
+
+
+    def fetch_data(self):
+
+        self.token = self.connect()
+        if self.config['records'] == 'module.Contacts':
+            logger.info("Contact \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Contacts?fields=Account_Name,
+            First_Name,Lead_Source,Home,Fax,Skype_ID,Asst_Phone,Phone,
+            Title,Department,Twitter,Last_Name,Contact_Name,Phone,Email,Reporting_To,
+            Mailing_Street,Mailing_City,Mailing_State,Mailing_Zip,Mailing_Country,
+            Description,Contact_Owner,Lead_Source,Date_of_Birth,Contact_Image
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Accounts':
+            logger.info("Accounts \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Accounts?fields=Account_Owner,Account_Name,Account_Site,Parent_Account,
+            Account_Number,Account_Type,Industry,Annual_Revenue,Rating,Phone,Fax,Website,Ticker_Symbol,OwnerShip,Employees,Sic_Code,
+            Billing_Street,Billing_City,Billing_State,Billing_Code,Billing_Country,Shipping_Street,Shipping_City,Shipping_State,Shipping_Code,
+            Shipping_Country,Description
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Leads':
+            logger.info("Leads \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Leads?fields=Lead_Owner,First_Name,Title,Mobile,Lead_Source,
+            Industry,Annual_Revenue,Company,Last_Name,Email,Fax,Website,Lead_Status,Rating,Skype_ID,
+            Description,Twitter,City,Street,State,Country,Zip_Code,No_of_Employees
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Deals':
+            logger.info("Deals \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Deals?fields=Deal_Owner,Deal_Name,Account_Name,
+            Type,Next_Step,Lead_Source,Contact_Name,Amount,Closing_Date,Stage,Probability,Expected_Revenue,
+            Campaign_Source,Description
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Campaigns':
+            logger.info("Campaigns \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Campaigns?fields=Campaign_Owner,Campaign_Name,Start_Date,
+            Expected_Revenue,Actual_Cost,Number_sent,Type,Status,End_Date,Budgeted_Cost,Expected_Response,Description
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Tasks':
+            logger.info("Tasks \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Tasks?fields=Task_Owner,Subject,Due_Date,Contact,Deal,Status,Priority,Reminder,
+            Repeat,Description
+            &converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module.Calls':
+            logger.info("Calls \n")
+            self.url = f"""https://www.zohoapis.com/crm/v5/Calls?fields=Call_To,Related_To,Call_Type,Outgoing_Call_Status,
+            Call_Start_Time,Call_Owner,Subject,Created_By,Modified_By,Call_Purpose,Call_Agenda&converted=true&per_page={self.config['per_page']}"""
+
+        elif self.config['records'] == 'module':
+            logger.info("Modules \n")
+            self.url = "https://www.zohoapis.com/crm/v5/settings/modules"
+
+        headers = {"Authorization":f"Zoho-oauthtoken {self.token}"}
+
+        response = requests.get(url=self.url,headers=headers).json()['data']
+
+        flattened_data = [self.flatten_dict(item) for item in response]
+
+
+        df = pd.DataFrame(flattened_data )
+
+        logger.info(f" data \n {df}")
+
+        return df
+
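Note: flatten_dict is what turns Zoho's nested record payloads into flat DataFrame columns; a standalone illustration of its behavior (sample values invented):

    def flatten_dict(d, parent_key='', sep='_'):
        # Same logic as the method above, shown free-standing.
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_dict(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    row = {"Owner": {"name": "Jane", "id": "42"}, "Amount": 1200}
    assert flatten_dict(row) == {"Owner_name": "Jane", "Owner_id": "42", "Amount": 1200}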
From 939ae2898b037fe05345c82a612a4ef11bd4136f Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sat, 24 Aug 2024 20:31:41 -0800
Subject: [PATCH 09/34] added zoho desk

---
 vector_etl/source_mods/__init__.py         |   3 +
 vector_etl/source_mods/zoho_crm_loader.py  |  11 +-
 vector_etl/source_mods/zoho_desk_loader.py | 130 +++++++++++++++++++++
 3 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 vector_etl/source_mods/zoho_desk_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 3014119..399f4f2 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -13,6 +13,7 @@ from .google_bigquery import GoogleBigQuerySource
 from .hubspot_loader import HubSpotSource
 from .zoho_crm_loader import ZohoCrmSource
+from .zoho_desk_loader import ZohoDeskSource
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -41,5 +42,7 @@ def get_source_class(config):
     elif source_type == 'Zoho Crm':
         return ZohoCrmSource(config)
+    elif source_type == 'Zoho Desk':
+        return ZohoDeskSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/zoho_crm_loader.py b/vector_etl/source_mods/zoho_crm_loader.py
index a768066..b06c4db 100644
--- a/vector_etl/source_mods/zoho_crm_loader.py
+++ b/vector_etl/source_mods/zoho_crm_loader.py
@@ -1,4 +1,4 @@
-from base import BaseSource
+from .base import BaseSource
 import pandas as pd
 import requests
 import logging
@@ -147,6 +147,15 @@ def fetch_data(self):
+
+
+
diff --git a/vector_etl/source_mods/zoho_desk_loader.py b/vector_etl/source_mods/zoho_desk_loader.py
new file mode 100644
index 0000000..f19cc54
--- /dev/null
+++ b/vector_etl/source_mods/zoho_desk_loader.py
@@ -0,0 +1,130 @@
+from .base import BaseSource
+import pandas as pd
+import requests
+import logging
+from pprint import pprint
+import os
+import json
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class ZohoDeskSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.token = None
+        self.url = None
+        self.grant_type = self.config['grant_type']
+        self.client_id = self.config['client_id']
+        self.client_secret = self.config['client_secret']
+        self.code = self.config['code']
+        self.accounts_url = self.config['accounts_url']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self.flatten_dict(v, new_key, sep=sep).items())
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self):
+
+        data = {
+            "grant_type":self.grant_type,
+            "client_id": self.client_id,
+            "client_secret": self.client_secret,
+            "code": self.code
+        }
+        try:
+            if os.path.exists("token.json"):
+                with open("token.json",'r') as token_file:
+                    token_data = json.load(token_file)
+                    self.token = token_data.get("access_token")
+                    return self.token
+            else:
+                response = requests.post(url=self.accounts_url, data=data)
+                logger.info(f"Status {response.status_code}")
+                with open("token.json", 'w') as token_file:
+                    json.dump({"access_token": response.json()["access_token"]}, token_file)
+
+                logger.info("New token fetched and saved.")
+                tokens = response.json()["access_token"]
+                return tokens
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"connection Error {http_err}")
+
+
+    def fetch_data(self):
+
+        self.token = self.connect()
+        if self.config['records'] == 'desk.agents':
+            logger.info("Agents \n")
+            self.url = f"https://desk.zoho.com/api/v1/agents?limit={self.config['limit']}"
+
+        elif self.config['records'] == 'desk.team':
+            logger.info("Teams \n")
+            self.url = f"https://desk.zoho.com/api/v1/teams"
+            headers = {"Authorization":f"Zoho-oauthtoken {self.token}"}
+
+            response = requests.get(url=self.url,headers=headers).json()['teams']
+
+            flattened_data = [self.flatten_dict(item) for item in response]
+
+            df = pd.DataFrame(flattened_data )
+
+            logger.info(f" data \n {df}")
+
+            return df
+
+        elif self.config['records'] == 'desk.ticket':
+            logger.info("Ticket \n")
+            self.url = f"""https://desk.zoho.com/api/v1/tickets?include=contacts,
+            assignee,departments,team,isRead&limit={self.config['limit']}"""
+
+        elif self.config['records'] == 'desk.contacts':
+            logger.info("Contact \n")
+            self.url = f"https://desk.zoho.com/api/v1/contacts?limit={self.config['limit']}"
+
+        try:
+            headers = {"Authorization":f"Zoho-oauthtoken {self.token}"}
+
+            response = requests.get(url=self.url,headers=headers)
+
+            flattened_data = [self.flatten_dict(item) for item in response]
+
+            df = pd.DataFrame(flattened_data )
+
+            logger.info(f" data \n {df}")
+
+            return df
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"HTTP error occurred: {http_err}")
+
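Note: connect() above caches the OAuth token in token.json and reuses it indefinitely, but Zoho access tokens expire (roughly hourly), so a stale cache will eventually start failing. One hedged extension is to timestamp the cache — a sketch only; the fetched_at field is not written by the patch as it stands:

    import json, os, time

    def load_cached_token(path="token.json", max_age_seconds=3300):
        # Treat the cached token as stale once it nears Zoho's ~1h expiry.
        if not os.path.exists(path):
            return None
        with open(path) as f:
            data = json.load(f)
        if time.time() - data.get("fetched_at", 0) > max_age_seconds:
            return None
        return data.get("access_token")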
From 07775e62e1cbe117e8518e29f0fc959ffcc05f40 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Wed, 28 Aug 2024 10:30:34 -0800
Subject: [PATCH 10/34] added intercom source

---
 vector_etl/source_mods/__init__.py         |   3 +
 vector_etl/source_mods/intercom_loader.py  | 131 +++++++++++++++++++++
 vector_etl/source_mods/zoho_desk_loader.py |   2 +-
 3 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 vector_etl/source_mods/intercom_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 399f4f2..731988c 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -14,6 +14,7 @@ from .hubspot_loader import HubSpotSource
 from .zoho_crm_loader import ZohoCrmSource
 from .zoho_desk_loader import ZohoDeskSource
+from . intercom_loader import InterComSource
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -44,5 +45,7 @@ def get_source_class(config):
     elif source_type == 'Zoho Desk':
         return ZohoDeskSource(config)
+    elif source_type == "InterCom":
+        return InterComSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/intercom_loader.py b/vector_etl/source_mods/intercom_loader.py
new file mode 100644
index 0000000..b771a09
--- /dev/null
+++ b/vector_etl/source_mods/intercom_loader.py
@@ -0,0 +1,131 @@
+from base import BaseSource
+import requests
+from pprint import pprint
+import logging
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class InterComSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.url = None
+        self.token = self.config['token']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self.flatten_dict(v, new_key, sep=sep).items())
+            elif isinstance(v, list):
+                for i, item in enumerate(v):
+                    if isinstance(item, dict):
+                        items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
+                    else:
+                        items.append((f"{new_key}{sep}{i}", item))
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self,url):
+        headers = {"Authorization": f"Bearer {self.token}",
+                   "Content-type":"application/json",
+                   "Intercom-Version":"2.11"}
+        response = requests.get(url=url,headers=headers)
+
+        return response
+
+
+    def fetch_data(self):
+
+        if self.config['records'] == 'intercom.articles':
+            logger.info(" articles \n")
+            self.url = f"https://api.intercom.io/articles"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.companies.scroll':
+            logger.info(" Companies \n")
+            self.url = f"https://api.intercom.io/companies/scroll"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.contacts':
+            logger.info(" Contacts \n")
+            self.url = f"https://api.intercom.io/contacts"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.conversations':
+            logger.info(" conversations \n")
+            self.url = f"https://api.intercom.io/conversations"
+
+            response = self.connect(self.url).json()['conversations']
+
+        if self.config['records'] == 'intercom.collections':
+            logger.info(" collection \n")
+            self.url = f"https://api.intercom.io/help_center/collections"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.news_items':
+            logger.info(" news items \n")
+            self.url = f"https://api.intercom.io/news/news_items"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.segments':
+            logger.info(" segments \n")
+            self.url = f"https://api.intercom.io/segments"
+
+            response = self.connect(self.url).json()['segments']
+
+        if self.config['records'] == 'intercom.subscription_types':
+            logger.info(" subscription_types \n")
+            self.url = f"https://api.intercom.io/subscription_types"
+
+            response = self.connect(self.url).json()['data']
+
+        if self.config['records'] == 'intercom.teams':
+            logger.info(" Teams \n")
+            self.url = f"https://api.intercom.io/teams"
+
+            response = self.connect(self.url).json()['teams']
+
+        if self.config['records'] == 'intercom.ticket_types':
+            logger.info(" ticket_types \n")
+            self.url = f"https://api.intercom.io/ticket_types"
+
+            response = self.connect(self.url).json()['data']
+
+        try:
+            flattened_data = [self.flatten_dict(item) for item in response]
+            pprint(flattened_data,indent=4)
+
+            df = pd.DataFrame(flattened_data )
+
+            logger.info(f" data \n {df}")
+
+            return df
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"HTTP error occurred: {http_err}")
+
diff --git a/vector_etl/source_mods/zoho_desk_loader.py b/vector_etl/source_mods/zoho_desk_loader.py
index f19cc54..6280c42 100644
--- a/vector_etl/source_mods/zoho_desk_loader.py
+++ b/vector_etl/source_mods/zoho_desk_loader.py
@@ -103,7 +103,7 @@ def fetch_data(self):
         try:
             headers = {"Authorization":f"Zoho-oauthtoken {self.token}"}
 
-            response = requests.get(url=self.url,headers=headers)
+            response = requests.get(url=self.url,headers=headers)['data']
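Note: as with the HubSpot loader, the Intercom record-type branches differ only in URL and in which key of the JSON payload holds the rows, so they reduce to a table — a sketch (pairs taken from the branches above; helper name illustrative):

    INTERCOM_ENDPOINTS = {
        "intercom.articles": ("https://api.intercom.io/articles", "data"),
        "intercom.contacts": ("https://api.intercom.io/contacts", "data"),
        "intercom.conversations": ("https://api.intercom.io/conversations", "conversations"),
        "intercom.segments": ("https://api.intercom.io/segments", "segments"),
        "intercom.teams": ("https://api.intercom.io/teams", "teams"),
        "intercom.ticket_types": ("https://api.intercom.io/ticket_types", "data"),
        # remaining record types follow the same (url, payload_key) pattern
    }

    def resolve_endpoint(records):
        try:
            return INTERCOM_ENDPOINTS[records]
        except KeyError:
            raise ValueError(f"Unsupported Intercom record type: {records}")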
From 9005c5bdd09261d9a408d1c8481f79a4ec98f9b2 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Wed, 28 Aug 2024 16:14:03 -0800
Subject: [PATCH 11/34] added paystack source

---
 vector_etl/source_mods/__init__.py        |   5 +-
 vector_etl/source_mods/intercom_loader.py |   2 +-
 vector_etl/source_mods/paystack_loader.py | 112 ++++++++++++++++++++++
 3 files changed, 117 insertions(+), 2 deletions(-)
 create mode 100644 vector_etl/source_mods/paystack_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index 731988c..c2af60e 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -14,7 +14,8 @@ from .google_bigquery import GoogleBigQuerySource
 from .hubspot_loader import HubSpotSource
 from .zoho_crm_loader import ZohoCrmSource
 from .zoho_desk_loader import ZohoDeskSource
-from . intercom_loader import InterComSource
+from .intercom_loader import InterComSource
+from .paystack_loader import PayStackSource
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -47,5 +48,7 @@ def get_source_class(config):
         return ZohoDeskSource(config)
     elif source_type == "InterCom":
         return InterComSource(config)
+    elif source_type == 'PayStack':
+        return PayStackSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/intercom_loader.py b/vector_etl/source_mods/intercom_loader.py
index b771a09..745e951 100644
--- a/vector_etl/source_mods/intercom_loader.py
+++ b/vector_etl/source_mods/intercom_loader.py
@@ -1,4 +1,4 @@
-from base import BaseSource
+from .base import BaseSource
 import requests
 from pprint import pprint
 import logging
 import pandas as pd
diff --git a/vector_etl/source_mods/paystack_loader.py b/vector_etl/source_mods/paystack_loader.py
new file mode 100644
index 0000000..3900090
--- /dev/null
+++ b/vector_etl/source_mods/paystack_loader.py
@@ -0,0 +1,112 @@
+from .base import BaseSource
+import requests
+from pprint import pprint
+import logging
+import pandas as pd
+
+from paystackapi.paystack import Paystack
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class PayStackSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.paystack_secret_key = self.config['token']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self.flatten_dict(v, new_key, sep=sep).items())
+            elif isinstance(v, list):
+                for i, item in enumerate(v):
+                    if isinstance(item, dict):
+                        items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
+                    else:
+                        items.append((f"{new_key}{sep}{i}", item))
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self):
+
+        paystack = Paystack(secret_key=self.paystack_secret_key)
+
+        return paystack
+
+
+    def fetch_data(self):
+
+        if self.config['records'] == 'paystack.transactions':
+            logger.info(" Transactions \n")
+            response = self.connect().transaction.list()['data']
+
+        elif self.config['records'] == 'paystack.transactions.split':
+            logger.info(" Transactions split \n")
+            response = self.connect().transactionSplit.list()['data']
+
+        elif self.config['records'] == 'paystack.invoice':
+            logger.info(" invoice \n")
+            response = self.connect().invoice.list()['data']
+
+        elif self.config['records'] == 'paystack.product':
+            logger.info(" product \n")
+            response = self.connect().product.list()['data']
+
+        elif self.config['records'] == 'paystack.customer':
+            logger.info(" customer \n")
+            response = self.connect().customer.list()['data']
+
+        elif self.config['records'] == 'paystack.plan':
+            logger.info(" plan \n")
+            response = self.connect().plan.list()['data']
+
+        elif self.config['records'] == 'paystack.subaccount':
+            logger.info(" subaccount \n")
+            response = self.connect().subaccount.list()['data']
+
+        elif self.config['records'] == 'paystack.subscription':
+            logger.info(" subscription \n")
+            response = self.connect().subscription.list()['data']
+
+        elif self.config['records'] == 'paystack.transfer':
+            logger.info(" transfer \n")
+            response = self.connect().transfer.list()['data']
+
+        elif self.config['records'] == 'paystack.bulkcharge':
+            logger.info(" bulkcharge \n")
+            response = self.connect().bulkcharge.list()['data']
+
+        elif self.config['records'] == 'paystack.refund':
+            logger.info(" refund \n")
+            response = self.connect().refund.list()['data']
+
+        try:
+            flattened_data = [self.flatten_dict(item) for item in response]
+            pprint(flattened_data,indent=4)
+
+            df = pd.DataFrame(flattened_data )
+
+            logger.info(f" data \n {df}")
+
+            return df
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"HTTP error occurred: {http_err}")
+

From 5c8e47b0bcf44eb794b45d662b8cc053f4c0ecec Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Wed, 28 Aug 2024 18:00:28 -0800
Subject: [PATCH 12/34] added flutterwave source

---
 vector_etl/source_mods/__init__.py           |   3 +
 vector_etl/source_mods/flutterwave_loader.py | 113 +++++++++++++++++++
 vector_etl/source_mods/intercom_loader.py    |  18 +--
 vector_etl/source_mods/paystack_loader.py    |   7 +-
 4 files changed, 127 insertions(+), 14 deletions(-)
 create mode 100644 vector_etl/source_mods/flutterwave_loader.py

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index c2af60e..285e9fb 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -16,6 +16,7 @@ from .zoho_desk_loader import ZohoDeskSource
 from .intercom_loader import InterComSource
 from .paystack_loader import PayStackSource
+from .flutterwave_loader import FlutterWaveSource
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -50,5 +51,7 @@ def get_source_class(config):
     elif source_type == 'PayStack':
         return PayStackSource(config)
+    elif source_type == "FlutterWave":
+        return FlutterWaveSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")
diff --git a/vector_etl/source_mods/flutterwave_loader.py b/vector_etl/source_mods/flutterwave_loader.py
new file mode 100644
index 0000000..d49f70a
--- /dev/null
+++ b/vector_etl/source_mods/flutterwave_loader.py
@@ -0,0 +1,113 @@
+from .base import BaseSource
+import requests
+from pprint import pprint
+import logging
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class FlutterWaveSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.url = None
+        self.secret_key = self.config['secret_key']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(self.flatten_dict(v, new_key, sep=sep).items())
+            elif isinstance(v, list):
+                for i, item in enumerate(v):
+                    if isinstance(item, dict):
+                        items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
+                    else:
+                        items.append((f"{new_key}{sep}{i}", item))
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self,url):
+        headers = {"Authorization": f"Bearer {self.secret_key}",
+                   "Content-type":"application/json"}
+        response = requests.get(url=url,headers=headers)
+
+        return response
+
+
+    def fetch_data(self):
+
+        if self.config['records'] == 'flutterwave.transfers':
+            logger.info(" Transfers \n")
+            self.url = f"https://api.flutterwave.com/v3/transfers"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.transactions':
+            logger.info(" transactions \n")
+            self.url = f"https://api.flutterwave.com/v3/transactions"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.beneficiaries':
+            logger.info(" beneficiaries \n")
+            self.url = f"https://api.flutterwave.com/v3/beneficiaries"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.subaccounts':
+            logger.info(" subaccounts \n")
+            self.url = f"https://api.flutterwave.com/v3/subaccounts"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.payout-subaccounts':
+            logger.info(" payout-subaccounts \n")
+            self.url = f"https://api.flutterwave.com/v3/payout-subaccounts"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.subscriptions':
+            logger.info(" subscriptions \n")
+            self.url = f"https://api.flutterwave.com/v3/subscriptions"
+
+            response = self.connect(self.url).json()['data']
+
+        elif self.config['records'] == 'flutterwave.payment-plans':
+            logger.info(" payment-plans \n")
+            self.url = f"https://api.flutterwave.com/v3/payment-plans"
+
+            response = self.connect(self.url).json()['data']
+
+        try:
+            flattened_data = [self.flatten_dict(item) for item in response]
+            pprint(flattened_data,indent=4)
+
+            df = pd.DataFrame(flattened_data )
+
+            logger.info(f" data \n {df}")
+
+            return df
+        except requests.exceptions.HTTPError as http_err:
+            logger.error(f"HTTP error occurred: {http_err}")
+
diff --git a/vector_etl/source_mods/intercom_loader.py b/vector_etl/source_mods/intercom_loader.py
index 745e951..b3a8581 100644
--- a/vector_etl/source_mods/intercom_loader.py
+++ b/vector_etl/source_mods/intercom_loader.py
@@ -49,14 +49,14 @@ def fetch_data(self):
             response = self.connect(self.url).json()['data']
 
-        if self.config['records'] == 'intercom.companies.scroll':
+        elif self.config['records'] == 'intercom.companies.scroll':
             logger.info(" Companies \n")
             self.url = f"https://api.intercom.io/companies/scroll"
 
             response = self.connect(self.url).json()['data']
 
-        if self.config['records'] == 'intercom.contacts':
+        elif self.config['records'] == 'intercom.contacts':
             logger.info(" Contacts \n")
             self.url = f"https://api.intercom.io/contacts"
 
@@ -64,7 +64,7 @@ def fetch_data(self):
 
-        if self.config['records'] == 'intercom.conversations':
+        elif self.config['records'] == 'intercom.conversations':
             logger.info(" conversations \n")
             self.url = f"https://api.intercom.io/conversations"
 
@@ -72,21 +72,21 @@ def fetch_data(self):
 
-        if self.config['records'] == 'intercom.collections':
+        elif self.config['records'] == 'intercom.collections':
             logger.info(" collection \n")
             self.url = f"https://api.intercom.io/help_center/collections"
 
             response = self.connect(self.url).json()['data']
 
-        if self.config['records'] == 'intercom.news_items':
+        elif self.config['records'] == 'intercom.news_items':
             logger.info(" news items \n")
             self.url = f"https://api.intercom.io/news/news_items"
 
             response = self.connect(self.url).json()['data']
 
-        if self.config['records'] == 'intercom.segments':
+        elif self.config['records'] == 'intercom.segments':
             logger.info(" segments \n")
             self.url = f"https://api.intercom.io/segments"
 
@@ -94,20 +94,20 @@ def fetch_data(self):
 
-        if self.config['records'] == 'intercom.subscription_types':
+        elif self.config['records'] == 'intercom.subscription_types':
             logger.info(" subscription_types \n")
             self.url = f"https://api.intercom.io/subscription_types"
 
             response = self.connect(self.url).json()['data']
 
-        if self.config['records'] == 'intercom.teams':
+        elif self.config['records'] == 'intercom.teams':
             logger.info(" Teams \n")
             self.url = f"https://api.intercom.io/teams"
f"https://api.intercom.io/teams" response = self.connect(self.url).json()['teams'] - if self.config['records'] == 'intercom.ticket_types': + elif self.config['records'] == 'intercom.ticket_types': logger.info(" ticket_types \n") self.url = f"https://api.intercom.io/ticket_types" diff --git a/vector_etl/source_mods/paystack_loader.py b/vector_etl/source_mods/paystack_loader.py index 3900090..0fd7b8a 100644 --- a/vector_etl/source_mods/paystack_loader.py +++ b/vector_etl/source_mods/paystack_loader.py @@ -1,6 +1,4 @@ from .base import BaseSource -import requests -from pprint import pprint import logging import pandas as pd @@ -13,7 +11,7 @@ class PayStackSource(BaseSource): def __init__(self,config): self.config = config - self.paystack_secret_key = self.config['token'] + self.paystack_secret_key = self.config['paystack_secret_key'] def flatten_dict(self, d, parent_key='', sep='_'): @@ -93,14 +91,13 @@ def fetch_data(self): try: flattened_data = [self.flatten_dict(item) for item in response] - pprint(flattened_data,indent=4) df = pd.DataFrame(flattened_data ) logger.info(f" data \n {df}") return df - except requests.exceptions.HTTPError as http_err: + except Exception as http_err: logger.error(f"HTTP error occurred: {http_err}") From cc0bf928723e94f4c53622a298590787a32ae69c Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 30 Aug 2024 06:11:21 -0800 Subject: [PATCH 13/34] added sources flutterwave,hubspot,zoho,paystack --- vector_etl/source_mods/__init__.py | 4 ++-- vector_etl/source_mods/google_bigquery.py | 5 +---- vector_etl/source_mods/hubspot_loader.py | 2 +- vector_etl/source_mods/zoho_desk_loader.py | 16 +--------------- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py index 285e9fb..96a4b1e 100644 --- a/vector_etl/source_mods/__init__.py +++ b/vector_etl/source_mods/__init__.py @@ -43,9 +43,9 @@ def get_source_class(config): return GoogleBigQuerySource(config) elif source_type == 'HubSpot': return HubSpotSource(config) - elif source_type == 'Zoho Crm': + elif source_type == 'ZohoCrm': return ZohoCrmSource(config) - elif source_type == 'Zoho Desk': + elif source_type == 'ZohoDesk': return ZohoDeskSource(config) elif source_type == "InterCom": return InterComSource(config) diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py index 3a93a99..51e979d 100644 --- a/vector_etl/source_mods/google_bigquery.py +++ b/vector_etl/source_mods/google_bigquery.py @@ -41,8 +41,5 @@ def fetch_data(self): } -data = GoogleBigQuerySource(config) -print(type(data.fetch_data())) -print(data.fetch_data()) - + diff --git a/vector_etl/source_mods/hubspot_loader.py b/vector_etl/source_mods/hubspot_loader.py index 5b7d381..ca93421 100644 --- a/vector_etl/source_mods/hubspot_loader.py +++ b/vector_etl/source_mods/hubspot_loader.py @@ -43,7 +43,7 @@ def fetch_data(self): self.endpoints = f"https://api.hubapi.com/crm/v3/objects/deals?limit={self.config['limit']}&archived={self.config['archive']}" logger.info(f"Deals \n") - elif self.config['crm_object'] == "crm.products": + elif self.config['crm_object'] == "crm_object": self.endpoints = f"https://api.hubapi.com/crm/v3/objects/products?limit={self.config['limit']}&archived={self.config['archive']}" logger.info(f"products \n") diff --git a/vector_etl/source_mods/zoho_desk_loader.py b/vector_etl/source_mods/zoho_desk_loader.py index 6280c42..9a083b8 100644 --- a/vector_etl/source_mods/zoho_desk_loader.py +++ 
b/vector_etl/source_mods/zoho_desk_loader.py @@ -100,21 +100,7 @@ def fetch_data(self): self.url = f"https://desk.zoho.com/api/v1/contacts?limit={self.config['limit']}" - try: - headers = {"Authorization":f"Zoho-oauthtoken {self.token}"} - - response = requests.get(url=self.url,headers=headers)['data'] - - flattened_data = [self.flatten_dict(item) for item in response] - - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - except requests.exceptions.HTTPError as http_err: - logger.error(f"HTTP error occurred: {http_err}") + From 2d6f4ec03c9762fbab30406ca7f596a1e1d6eaac Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 30 Aug 2024 06:12:55 -0800 Subject: [PATCH 14/34] added sources flutterwave,hubspot,zoho,paystack --- .gitignore | 3 + examples/flutterwave_to_pinecone.yaml | 20 ++++ examples/hubspot_to_pinecone.yaml | 23 +++++ examples/intercom_to_pinecone.yaml | 20 ++++ examples/paystack_to_pincone.yaml | 20 ++++ examples/zohocrm_to_pinecone.yaml | 25 +++++ examples/zohodesk_to_pinecone.yaml | 25 +++++ tests/test_source_mods.py | 132 ++++++++++++++++++++++++++ 8 files changed, 268 insertions(+) create mode 100644 examples/flutterwave_to_pinecone.yaml create mode 100644 examples/hubspot_to_pinecone.yaml create mode 100644 examples/intercom_to_pinecone.yaml create mode 100644 examples/paystack_to_pincone.yaml create mode 100644 examples/zohocrm_to_pinecone.yaml create mode 100644 examples/zohodesk_to_pinecone.yaml diff --git a/.gitignore b/.gitignore index e41350c..20ebccf 100644 --- a/.gitignore +++ b/.gitignore @@ -154,6 +154,9 @@ dmypy.json # Cython debug symbols cython_debug/ +vector_etl/source_mods/google_bigquery.py +vector_etl/source_mods/paystack_loader.py + # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore diff --git a/examples/flutterwave_to_pinecone.yaml b/examples/flutterwave_to_pinecone.yaml new file mode 100644 index 0000000..8e93e83 --- /dev/null +++ b/examples/flutterwave_to_pinecone.yaml @@ -0,0 +1,20 @@ +source: + source_data_type: "FlutterWave" + secret_key: "" + records: "flutterwave.payout-subaccounts" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/examples/hubspot_to_pinecone.yaml b/examples/hubspot_to_pinecone.yaml new file mode 100644 index 0000000..aaaff75 --- /dev/null +++ b/examples/hubspot_to_pinecone.yaml @@ -0,0 +1,23 @@ +source: + source_data_type: "HubSpot" + archive: "" + limit: "" + client_secret: "" + access_token: "" + crm_object: "" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/examples/intercom_to_pinecone.yaml b/examples/intercom_to_pinecone.yaml new 
file mode 100644 index 0000000..55195d6 --- /dev/null +++ b/examples/intercom_to_pinecone.yaml @@ -0,0 +1,20 @@ +source: + source_data_type: "InterCom" + token: "FlutterWave" + records: "intercom.teams" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/examples/paystack_to_pincone.yaml b/examples/paystack_to_pincone.yaml new file mode 100644 index 0000000..b301a73 --- /dev/null +++ b/examples/paystack_to_pincone.yaml @@ -0,0 +1,20 @@ +source: + source_data_type: "PayStackS" + paystack_secret_key: "" + records: "paystack.transactions" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/examples/zohocrm_to_pinecone.yaml b/examples/zohocrm_to_pinecone.yaml new file mode 100644 index 0000000..8d55cb1 --- /dev/null +++ b/examples/zohocrm_to_pinecone.yaml @@ -0,0 +1,25 @@ +source: + source_data_type: "ZohoCrm" + grant_type: "" + client_id: "" + client_secret: "" + code: "" + per_page: "" + records: "" + accounts_url: "" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/examples/zohodesk_to_pinecone.yaml b/examples/zohodesk_to_pinecone.yaml new file mode 100644 index 0000000..8d408b3 --- /dev/null +++ b/examples/zohodesk_to_pinecone.yaml @@ -0,0 +1,25 @@ +source: + source_data_type: "ZohoDesk" + grant_type: "" + client_id: "" + client_secret: "" + code: "" + per_page: "" + records: "" + accounts_url: "" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py index c83dfe5..0b74b4c 100644 --- a/tests/test_source_mods.py +++ b/tests/test_source_mods.py @@ -7,6 +7,12 @@ from vector_etl.source_mods.local_file import LocalFileSource from vector_etl.source_mods.google_bigquery import GoogleBigQuerySource from vector_etl.source_mods.airtable_loader import AirTableSource +from vector_etl.source_mods.hubspot_loader import HubSpotSource +from vector_etl.source_mods.intercom_loader import InterComSource +from 
vector_etl.source_mods.paystack_loader import PayStackSource +from vector_etl.source_mods.zoho_crm_loader import ZohoCrmSource +from vector_etl.source_mods.zoho_desk_loader import ZohoDeskSource +from vector_etl.source_mods.flutterwave_loader import FlutterWaveSource @pytest.fixture def s3_config(): @@ -39,6 +45,66 @@ def airtable_config(): } +@pytest.fixture +def zohodesk_config(): + return{ + "grant_type":"", + "client_id": "", + "client_secret": "", + "code": "", + "limit":"", + "records":"desk.team", + "accounts_url":"" + } + + +@pytest.fixture +def zohocrm_config(): + return{ + "grant_type":"", + "client_id": "", + "client_secret": "", + "code": "", + "per_page":"10", + "records":"module.Call", + "accounts_url":"" + } + + +@pytest.fixture +def hubspot_config(): + return{ + "archive":"", + "limit": "", + "access_token": "", + "crm_object":"crm_object", + } + + +@pytest.fixture +def paystack_config(): + return{ + "paystack_secret_key":"", + "records": "paystack.transactions", + } + + + +@pytest.fixture +def flutterwave_config(): + return{ + "secret_key":"", + "records": "flutterwave.payout-subaccounts", + } + +@pytest.fixture +def intercom_config(): + return{ + "token":"", + "records": "intercom.teams", + } + + @pytest.fixture def db_config(): return { @@ -175,5 +241,71 @@ def test_airtable_fetch_data(airtable_config): df = source.fetch_data() assert isinstance(df, pd.DataFrame) + + + + +def test_zohodesk_fetch_data(zohodesk_config): + with patch('requests.get') as mock_connect: + mock_connect.return_value = [ { + "Address": "333 Post St", + "Name": "Union Square", + "Visited": True + } + ] + + source = ZohoDeskSource(db_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + + + + +def test_zohocrm_fetch_data(zohocrm_config): + with patch('requests.get') as mock_connect: + mock_connect.return_value = [ { } + ] + + source = ZohoCrmSource(db_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + + + + +def test_paystack_fetch_data(paystack_config): + with patch('Paystack') as mock_connect: + mock_connect.return_value = [{}] + + source = PayStackSource(db_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + + + + +def test_intercom_fetch_data(intercom_config): + with patch('requests.get') as mock_connect: + mock_connect.return_value = [{}] + + source = InterComSource(db_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + + + +def test_flutterwave_fetch_data(flutterwave_config): + with patch('requests.get') as mock_connect: + mock_connect.return_value = [{}] + + source = FlutterWaveSource(db_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + From 8c4f8fbb650c7e86c65674a4dbd055e3ec07d8d3 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sat, 31 Aug 2024 02:26:12 -0800 Subject: [PATCH 15/34] add 6 sources --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 20ebccf..60993bb 100644 --- a/.gitignore +++ b/.gitignore @@ -154,8 +154,7 @@ dmypy.json # Cython debug symbols cython_debug/ -vector_etl/source_mods/google_bigquery.py -vector_etl/source_mods/paystack_loader.py + # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can From 86764ebbe929f4a83e8b4ea5173a70f12c89dfd3 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 1 Sep 2024 06:15:15 -0800 Subject: [PATCH 16/34] added gmail source --- vector_etl/source_mods/__init__.py | 3 + vector_etl/source_mods/gmail_loader.py 
| 102 +++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 vector_etl/source_mods/gmail_loader.py diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py index 96a4b1e..c2cbf14 100644 --- a/vector_etl/source_mods/__init__.py +++ b/vector_etl/source_mods/__init__.py @@ -17,6 +17,7 @@ from .intercom_loader import InterComSource from .paystack_loader import PayStackSource from .flutterwave_loader import FlutterWaveSource +from .gmail_loader import GmailSource def get_source_class(config): source_type = config['source_data_type'] if source_type == 'Amazon S3': @@ -53,5 +54,7 @@ def get_source_class(config): return PayStackSource(config) elif source_type == "FlutterWave": return FlutterWaveSource(config) + elif source_type == "Gmail": + return GmailSource(config) else: raise ValueError(f"Unsupported source type: {source_type}") diff --git a/vector_etl/source_mods/gmail_loader.py b/vector_etl/source_mods/gmail_loader.py new file mode 100644 index 0000000..9858bd0 --- /dev/null +++ b/vector_etl/source_mods/gmail_loader.py @@ -0,0 +1,102 @@ +import os.path +import base64 +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from pprint import pprint +from .base import BaseSource +import pandas as pd + +class GmailSource(BaseSource): + def __init__(self, config): + self.config = config + self.SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] + + def connect(self): + creds = None + if os.path.exists("token.json"): + creds = Credentials.from_authorized_user_file("token.json", self.SCOPES) + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + self.config['credentials'], self.SCOPES + ) + creds = flow.run_local_server(port=0) + with open("token.json", "w") as token: + token.write(creds.to_json()) + return creds + + def fetch_data(self): + creds = self.connect() + service = build("gmail", "v1", credentials=creds) + + # Extract the label from config + label = self.config.get('gmail.label').upper() + + messages = self.get_messages(service, label) + + if messages: + email_data = self.parse_messages(service, messages, label) + df = pd.DataFrame(email_data) + + return df + else: + print("No messages found.") + return None + + def get_messages(self, service, label): + try: + results = service.users().messages().list(userId="me", labelIds=[label]).execute() + return results.get("messages", []) + except HttpError as error: + print(f"An error occurred while fetching messages for label {label}: {error}") + return None + + def parse_messages(self, service, messages, label): + email_data = { + "id": [], + "threadId": [], + "label": [], + "subject": [], + "from": [], + "snippet": [], + "body": [], + } + + for message in messages: + msg = service.users().messages().get(userId="me", id=message["id"]).execute() + headers = msg["payload"]["headers"] + + subject, sender = self._get_header_info(headers) + snippet = msg.get("snippet", "") + body = self.get_body(msg) + + email_data["id"].append(message["id"]) + email_data["threadId"].append(message["threadId"]) + email_data["label"].append(label) + email_data["subject"].append(subject) + email_data["from"].append(sender) + email_data["snippet"].append(snippet) + email_data["body"].append(body) 
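+            # Note: get_body() below only reads msg["payload"]["body"]["data"], which
+            # is present for single-part messages. Multipart messages keep their text
+            # under msg["payload"]["parts"], which this loader does not traverse.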
+ + return email_data + + def get_header_info(self, headers): + subject = None + sender = None + for header in headers: + if header["name"] == "Subject": + subject = header["value"] + if header["name"] == "From": + sender = header["value"] + return subject, sender + + def get_body(self, msg): + if "data" in msg["payload"]["body"]: + return base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8") + else: + return "" From 5155ae521a529c79ce313ed511e0c8155753ee59 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 1 Sep 2024 06:17:41 -0800 Subject: [PATCH 17/34] added gmail source --- examples/gmail_to_pinecone.yaml | 20 ++++++ tests/test_source_mods.py | 106 +++++++++++++++++++++++++++----- 2 files changed, 110 insertions(+), 16 deletions(-) create mode 100644 examples/gmail_to_pinecone.yaml diff --git a/examples/gmail_to_pinecone.yaml b/examples/gmail_to_pinecone.yaml new file mode 100644 index 0000000..794db28 --- /dev/null +++ b/examples/gmail_to_pinecone.yaml @@ -0,0 +1,20 @@ +source: + source_data_type: "Gmail" + credentisla: "" + gmail.label: 'IMPORTANT' + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + pinecone_api_key: "" + index_name: "" + dimension: 1536 #[Optional] Only required if creating a new index + metric: "cosine" #[Optional] Only required if creating a new index + cloud: "aws" #[Optional] Only required if creating a new index + region: "us-east-1" #[Optional] Only required if creating a new index + + diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py index 0b74b4c..58f066d 100644 --- a/tests/test_source_mods.py +++ b/tests/test_source_mods.py @@ -13,6 +13,7 @@ from vector_etl.source_mods.zoho_crm_loader import ZohoCrmSource from vector_etl.source_mods.zoho_desk_loader import ZohoDeskSource from vector_etl.source_mods.flutterwave_loader import FlutterWaveSource +from vector_etl.source_mods.gmail_loader import GmailSource @pytest.fixture def s3_config(): @@ -44,6 +45,16 @@ def airtable_config(): "tableIdOrName":"survey" } + +@pytest.fixture +def gmail_config(): + return { + 'credentials': 'credentials.json', ## path to gmail crendtials + 'gmail.label': 'IMPORTANT' # Specify the label in the config + } + + + @pytest.fixture def zohodesk_config(): @@ -203,17 +214,11 @@ def test_google_bigquery_connect(google_bigquery_config): def test_google_bigquery_fetch_data(google_bigquery_config): with patch('bigquery.connect') as mock_connect: mock_connect.result.to_dataframe.return_value = pd.DataFrame() - source = GoogleBigQuerySource(db_config) + source = GoogleBigQuerySource(google_bigquery_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) - - - - - - def test_airtable_connect(airtable_config): with patch('requests.get') as mock_connect: @@ -237,12 +242,24 @@ def test_airtable_fetch_data(airtable_config): } ] - source = AirTableSource(db_config) + source = AirTableSource(airtable_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) - + +def test_zohodesk_connect(zohodesk_config): + + with patch('requests.get') as mock_connect: + source = ZohoDeskSource(zohodesk_config) + source.connect() + mock_connect.assert_called_once_with( + grant_type="", + client_id = "", + client_secret="", + code="", + accounts_url="" + ) def test_zohodesk_fetch_data(zohodesk_config): @@ -254,12 +271,23 @@ def test_zohodesk_fetch_data(zohodesk_config): } ] - source = ZohoDeskSource(db_config) + source = ZohoDeskSource(zohodesk_config) df = 
source.fetch_data() assert isinstance(df, pd.DataFrame) - +def test_zohocrm_connect(zohocrm_config): + + with patch('requests.get') as mock_connect: + source = ZohoCrmSource(zohocrm_config) + source.connect() + mock_connect.assert_called_once_with( + grant_type="", + client_id = "", + client_secret="", + code="", + accounts_url="" + ) def test_zohocrm_fetch_data(zohocrm_config): @@ -267,42 +295,88 @@ def test_zohocrm_fetch_data(zohocrm_config): mock_connect.return_value = [ { } ] - source = ZohoCrmSource(db_config) + source = ZohoCrmSource(zohocrm_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) - +def test_paystack_connect(paystack_config): + + with patch('requests.get') as mock_connect: + source = PayStackSource(paystack_config) + source.connect() + mock_connect.assert_called_once_with( + paystack_secret_key="", + ) def test_paystack_fetch_data(paystack_config): with patch('Paystack') as mock_connect: mock_connect.return_value = [{}] - source = PayStackSource(db_config) + source = PayStackSource(paystack_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) +def test_intercom_connect(intercom_config): + + with patch('requests.get') as mock_connect: + source = InterComSource(intercom_config) + source.connect() + mock_connect.assert_called_once_with( + secret_key="", + ) def test_intercom_fetch_data(intercom_config): with patch('requests.get') as mock_connect: mock_connect.return_value = [{}] - source = InterComSource(db_config) + source = InterComSource(intercom_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) + + +def test_flutterwave_connect(flutterwave_config): + + with patch('requests.get') as mock_connect: + source = FlutterWaveSource(flutterwave_config) + source.connect() + mock_connect.assert_called_once_with( + secret_key="", + ) def test_flutterwave_fetch_data(flutterwave_config): with patch('requests.get') as mock_connect: mock_connect.return_value = [{}] - source = FlutterWaveSource(db_config) + source = FlutterWaveSource(flutterwave_config) + df = source.fetch_data() + + assert isinstance(df, pd.DataFrame) + + + + +def test_gmail_connect(gmail_config): + + with patch('InstalledAppFlow.from_client_secrets_file') as mock_connect: + source = GmailSource(gmail_config) + source.connect() + mock_connect.assert_called_once_with( + credentials="credential.json", + ) + + +def test_gmail_fetch_data(gmail_config): + with patch('requests.get') as mock_connect: + mock_connect.return_value = [{}] + source = GmailSource(gmail_config) df = source.fetch_data() assert isinstance(df, pd.DataFrame) From 1474ab3b28b173eeb53dacdc2960e585a9f8966a Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Tue, 3 Sep 2024 07:12:42 -0800 Subject: [PATCH 18/34] added mailchipsource --- examples/mailchimp_to_pinecone.yaml | 21 +++++ tests/test_source_mods.py | 32 +++++++ vector_etl/source_mods/__init__.py | 4 + vector_etl/source_mods/mailchimp_loader.py | 100 +++++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 examples/mailchimp_to_pinecone.yaml create mode 100644 vector_etl/source_mods/mailchimp_loader.py diff --git a/examples/mailchimp_to_pinecone.yaml b/examples/mailchimp_to_pinecone.yaml new file mode 100644 index 0000000..8fd7087 --- /dev/null +++ b/examples/mailchimp_to_pinecone.yaml @@ -0,0 +1,21 @@ +source: + source_data_type: "MailChimp" + api_key: "" + server_prefix: "us13" + records: "ConnectedSites" + +embedding: + embedding_model: "OpenAI" + api_key: "" + model_name: "text-embedding-ada-002" + +target: + 
target_database: "Pinecone"
+  pinecone_api_key: ""
+  index_name: ""
+  dimension: 1536 #[Optional] Only required if creating a new index
+  metric: "cosine" #[Optional] Only required if creating a new index
+  cloud: "aws" #[Optional] Only required if creating a new index
+  region: "us-east-1" #[Optional] Only required if creating a new index
+
+
diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py
index 58f066d..d9eb027 100644
--- a/tests/test_source_mods.py
+++ b/tests/test_source_mods.py
@@ -14,6 +14,7 @@ from vector_etl.source_mods.zoho_desk_loader import ZohoDeskSource
 from vector_etl.source_mods.flutterwave_loader import FlutterWaveSource
 from vector_etl.source_mods.gmail_loader import GmailSource
+from vector_etl.source_mods.mailchimp_loader import MailChimpMarketingSource

 @pytest.fixture
 def s3_config():
@@ -138,6 +139,15 @@ def local_file_config():
         'chunk_overlap': 200
     }

+
+@pytest.fixture
+def mailchimp_config():
+    return {
+        'api_key': 'test_key',
+        'server_prefix': 'test_secret',  # the loader reads 'server_prefix', not 'server'
+        'records': 'campaign',           # must match a branch in fetch_data()
+    }
+
 def test_s3_source_connect(s3_config):
     with patch('boto3.client') as mock_client:
         source = S3Source(s3_config)
@@ -380,6 +390,28 @@ def test_gmail_fetch_data(gmail_config):
         df = source.fetch_data()

     assert isinstance(df, pd.DataFrame)
+
+
+def test_mailchimp_connect(mailchimp_config):
+    # Patch the importable module path; 'MailchimpMarketing...' is only the
+    # local import alias and is not a valid mock target.
+    with patch('mailchimp_marketing.Client.set_config') as mock_connect:
+        source = MailChimpMarketingSource(mailchimp_config)
+        source.connect()
+        mock_connect.assert_called_once_with({
+            "api_key": "test_key",
+            "server": "test_secret"
+        })
+
+
+def test_mailchimp_fetch_data(mailchimp_config):
+    with patch('mailchimp_marketing.Client.set_config') as mock_connect:
+        mock_connect.return_value = [{}]
+        source = MailChimpMarketingSource(mailchimp_config)
+        df = source.fetch_data()
+
+        assert isinstance(df, pd.DataFrame)

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index c2cbf14..c6d3b7f 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -18,6 +18,8 @@ from .paystack_loader import PayStackSource
 from .flutterwave_loader import FlutterWaveSource
 from .gmail_loader import GmailSource
+from .mailchimp_loader import MailChimpMarketingSource
+
 def get_source_class(config):
     source_type = config['source_data_type']
     if source_type == 'Amazon S3':
@@ -56,5 +58,7 @@ def get_source_class(config):
         return FlutterWaveSource(config)
     elif source_type == "Gmail":
         return GmailSource(config)
+    elif source_type == "MailChimp":
+        return MailChimpMarketingSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")

diff --git a/vector_etl/source_mods/mailchimp_loader.py b/vector_etl/source_mods/mailchimp_loader.py
new file mode 100644
index 0000000..4277c58
--- /dev/null
+++ b/vector_etl/source_mods/mailchimp_loader.py
@@ -0,0 +1,100 @@
+from base import BaseSource
+import mailchimp_marketing as MailchimpMarketing
+from mailchimp_marketing.api_client import ApiClientError
+import pandas as pd
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+from pprint import pprint
+
+
+class MailChimpMarketingSource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.api_key = self.config['api_key']
+        self.server_prefix = self.config['server_prefix']
+
+
+    def flatten_dict(self, d, parent_key='', sep='_'):
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
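+                # Nested dicts recurse with the parent key as a prefix. Illustrative
+                # example (not from the patch): {"stats": {"opens": 2}} flattens to
+                # {"stats_opens": 2}, and {"tags": ["a", "b"]} becomes
+                # {"tags_0": "a", "tags_1": "b"}.
+                items.extend(self.flatten_dict(v, new_key, 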
sep=sep).items())
+            elif isinstance(v, list):
+                for i, item in enumerate(v):
+                    if isinstance(item, dict):
+                        items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
+                    else:
+                        items.append((f"{new_key}{sep}{i}", item))
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+    def connect(self):
+        try:
+            client = MailchimpMarketing.Client()
+            client.set_config({
+                "api_key":self.api_key,
+                "server": self.server_prefix
+            })
+            return client
+        except ApiClientError as error:
+            print("Error: {}".format(error.text))
+
+
+    def fetch_data(self):
+        client = self.connect()
+        if self.config['records'] == "campaign":
+            response = client.campaigns.list()['campaigns']
+
+        elif self.config['records'] == "campaignFolders":
+            response = client.campaignFolders.list()['folders']
+
+        elif self.config['records'] == "ConnectedSites":
+            response = client.connectedSites.list()['sites']
+
+        elif self.config['records'] == "conversations":
+            # These two branches previously had their clients crossed
+            # (conversations read from ecommerce and vice versa).
+            response = client.conversations.list()['conversations']
+
+        elif self.config['records'] == "ecommerce":
+            response = client.ecommerce.stores()['stores']
+
+        elif self.config['records'] == "facebookAds":
+            response = client.facebookAds.list()['facebook_ads']
+
+        elif self.config['records'] == "landingpages":
+            response = client.landingPages.get_all()['landing_pages']
+
+        elif self.config['records'] == "reports":
+            response = client.reports.get_all_campaign_reports()['reports']
+
+        else:
+            raise ValueError(f"Unsupported records type: {self.config['records']}")
+
+        try:
+            flattened_data = [self.flatten_dict(item) for item in response]
+            pprint(flattened_data,indent=4)
+
+            df = pd.DataFrame(flattened_data)
+
+            logger.info(f" data \n {df}")
+
+            return df
+        except ApiClientError as error:
+            logger.error(f"HTTP error occurred: {error.text}")

From d21089ad481b8f20167fcf703b6b1496f6b8ea0b Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Fri, 6 Sep 2024 17:33:02 -0800
Subject: [PATCH 19/34] update requirements.txt file

---
 .gitignore                                 |   1 +
 requirements.txt                           | 280 +++++++++++++++++----
 vector_etl/source_mods/mailchimp_loader.py |   2 +-
 3 files changed, 239 insertions(+), 44 deletions(-)

diff --git a/.gitignore b/.gitignore
index 60993bb..9d5a561 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,6 +176,7 @@ vector_etl/tempfile_downloads/
 *_bkp.py
 vector_etl/source_mods/backup/
 vector_etl/target_mods/backup/
+vector_etl/source_mods/bitbucket_loader.py

 # Additional files
 .DS_Store

diff --git a/requirements.txt b/requirements.txt
index 6baad92..4ff3cc3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,43 +1,237 @@
-boto3
-botocore
-cohere
-cffi
-openai
-psycopg2-binary
-pinecone-client
-requests
-tiktoken
-python-dotenv
-pydantic
-mysql-connector-python
-pymysql
-pandas
-qdrant-client
-singlestoredb
-weaviate-client
-azure-storage-blob
-google-cloud-storage
-snowflake-connector-python
-stripe
-vecs
-simple-salesforce
-google-generativeai
-anthropic
-pympler
-unstructured[all-docs]
-dropbox
-zenpy
-lancedb
-pyyaml
-google-auth
-google-auth-oauthlib
-google-auth-httplib2
-google-api-python-client
-unstructured-client
-box-sdk-gen
-pymongo
-neo4j
-python-magic
-pytest
-nltk
-pymilvus
+annotated-types==0.7.0
+anthropic==0.32.0
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+asn1crypto==1.5.1
+attrs==24.2.0
+Authlib==1.3.1
+azure-core==1.30.2
+azure-storage-blob==12.22.0
+backoff==2.2.1
+backports.zoneinfo==0.2.1
+beautifulsoup4==4.12.3
+boto3==1.34.156
+botocore==1.34.156
+box-sdk-gen==1.2.0
+build==1.2.1
+cachetools==5.4.0
+cassandra-driver==3.29.1
+certifi==2024.7.4
+cffi==1.17.0
+chardet==5.2.0
+charset-normalizer==3.3.2 +click==8.1.7 +cohere==5.6.2 +colorama==0.4.6 +coloredlogs==15.0.1 +confuse==2.0.1 +contourpy==1.1.1 +cryptography==42.0.8 +cycler==0.12.1 +dataclasses-json==0.6.7 +db-dtypes==1.2.0 +decorator==5.1.1 +deepdiff==7.0.1 +Deprecated==1.2.14 +deprecation==2.1.0 +distro==1.9.0 +dnspython==2.6.1 +dropbox==12.0.2 +effdet==0.4.1 +emoji==2.12.1 +enum-compat==0.0.3 +et-xmlfile==1.1.0 +exceptiongroup==1.2.2 +fastavro==1.9.5 +filelock==3.15.4 +filetype==1.2.0 +flatbuffers==24.3.25 +flupy==1.2.0 +fonttools==4.53.1 +freshbooks-sdk==1.2.1 +fsspec==2024.6.1 +geomet==0.2.1.post1 +google-ai-generativelanguage==0.1.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 +google-auth==2.33.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.1 +google-cloud-bigquery==3.25.0 +google-cloud-core==2.4.1 +google-cloud-storage==2.18.1 +google-crc32c==1.5.0 +google-generativeai==0.1.0rc1 +google-resumable-media==2.7.1 +googleapis-common-protos==1.63.2 +greenlet==3.0.3 +grpcio==1.65.4 +grpcio-health-checking==1.62.3 +grpcio-status==1.62.3 +grpcio-tools==1.62.3 +h11==0.14.0 +h2==4.1.0 +hpack==4.0.0 +httpcore==1.0.5 +httplib2==0.22.0 +httpx==0.27.0 +httpx-sse==0.4.0 +huggingface-hub==0.24.5 +humanfriendly==10.0 +hyperframe==6.0.1 +idna==3.7 +importlib_metadata==8.2.0 +importlib_resources==6.4.0 +iniconfig==2.0.0 +intuit-oauth==1.2.6 +iopath==0.1.10 +isodate==0.6.1 +Jinja2==3.1.4 +jiter==0.5.0 +jmespath==1.0.1 +joblib==1.4.2 +jsonpath-python==1.0.6 +kiwisolver==1.4.5 +lancedb==0.6.13 +langdetect==1.0.9 +layoutparser==0.3.4 +lxml==5.2.2 +mailchimp-marketing==3.0.80 +Markdown==3.6 +MarkupSafe==2.1.5 +marshmallow==3.21.2 +matplotlib==3.7.5 +more-itertools==10.4.0 +mpmath==1.3.0 +msg-parser==1.2.0 +mypy-extensions==1.0.0 +mysql-connector==2.2.9 +mysql-connector-python==9.0.0 +neo4j==5.23.1 +nest-asyncio==1.6.0 +networkx==3.1 +nltk==3.8.1 +numpy==1.24.4 +oauthlib==3.2.2 +olefile==0.47 +omegaconf==2.3.0 +onnx==1.16.2 +onnxruntime==1.15.1 +openai==1.40.1 +opencv-python==4.10.0.84 +openpyxl==3.1.5 +ordered-set==4.1.0 +overrides==7.7.0 +packaging==24.1 +pandas==2.0.3 +parameterized==0.9.0 +parsimonious==0.10.0 +paystackapi==2.1.3 +pdf2image==1.17.0 +pdfminer.six==20231228 +pdfplumber==0.11.3 +pgvector==0.1.8 +pikepdf==9.1.0 +pillow==10.4.0 +pinecone-client==5.0.1 +pinecone-plugin-inference==1.0.3 +pinecone-plugin-interface==0.0.7 +platformdirs==4.2.2 +pluggy==1.5.0 +ply==3.11 +portalocker==2.10.1 +proto-plus==1.24.0 +protobuf==4.25.4 +psycopg2-binary==2.9.9 +py==1.11.0 +pyarrow==15.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycocotools==2.0.7 +pycparser==2.22 +pycryptodome==3.20.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +PyJWT==2.9.0 +pylance==0.10.12 +pymongo==4.8.0 +Pympler==1.1 +PyMySQL==1.1.1 +pyOpenSSL==24.2.1 +pypandoc==1.13 +pyparsing==3.1.2 +pypdf==4.3.1 +pypdfium2==4.30.0 +pyproject_hooks==1.1.0 +pyreadline3==3.4.1 +pytesseract==0.3.10 +pytest==8.3.2 +python-dateutil==2.9.0.post0 +python-docx==1.1.2 +python-dotenv==1.0.1 +python-iso639==2024.4.27 +python-magic==0.4.27 +python-multipart==0.0.9 +python-paypal-api==0.1.2 +python-pptx==0.6.23 +pytz==2024.1 +pywin32==306 +PyYAML==6.0.2 +qdrant-client==1.10.1 +rapidfuzz==3.9.6 +ratelimiter==1.2.0.post0 +rave-python==1.4.0 +regex==2024.7.24 +requests==2.32.3 +requests-file==2.1.0 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +retry==0.9.2 +rsa==4.9 +s3transfer==0.10.2 +safetensors==0.4.4 +scipy==1.10.1 +semver==3.0.2 +simple-salesforce==1.12.6 +singlestoredb==1.6.2 +six==1.16.0 +sniffio==1.3.1 +snowflake-connector-python==3.12.0 
+sortedcontainers==2.4.0 +soupsieve==2.5 +SQLAlchemy==2.0.32 +sqlparams==6.0.1 +stone==3.3.1 +stripe==10.6.0 +sympy==1.13.1 +tabulate==0.9.0 +tiktoken==0.7.0 +timm==1.0.8 +tokenizers==0.19.1 +tomli==2.0.1 +tomlkit==0.13.0 +torch==2.4.0 +torchvision==0.19.0 +tqdm==4.66.5 +transformers==4.44.0 +types-requests==2.31.0.6 +types-urllib3==1.26.25.14 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.1 +unstructured==0.11.8 +unstructured-client==0.25.4 +unstructured-inference==0.7.18 +unstructured.pytesseract==0.3.12 +uritemplate==4.1.1 +urllib3==1.26.19 +validators==0.33.0 +vecs==0.4.4 +weaviate-client==4.7.1 +wrapt==1.16.0 +xlrd==2.0.1 +XlsxWriter==3.2.0 +zeep==4.2.1 +zenpy==2.0.49 +zipp==3.19.2 +zohocrmsdk7_0==2.0.0 diff --git a/vector_etl/source_mods/mailchimp_loader.py b/vector_etl/source_mods/mailchimp_loader.py index 4277c58..d77a49d 100644 --- a/vector_etl/source_mods/mailchimp_loader.py +++ b/vector_etl/source_mods/mailchimp_loader.py @@ -1,4 +1,4 @@ -from base import BaseSource +from .base import BaseSource import mailchimp_marketing as MailchimpMarketing from mailchimp_marketing.api_client import ApiClientError import pandas as pd From bf890c2e9a45c7b2050e3a25eeddd3469bb2efba Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 6 Sep 2024 17:55:24 -0800 Subject: [PATCH 20/34] updates airtable loader --- vector_etl/source_mods/airtable_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py index 0e14b58..35889c1 100644 --- a/vector_etl/source_mods/airtable_loader.py +++ b/vector_etl/source_mods/airtable_loader.py @@ -1,6 +1,5 @@ import requests from .base import BaseSource -from pprint import pprint import pandas as pd import logging From fb45322ba6ba167de6b6ea8f92180d5e6b02a48c Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 6 Sep 2024 18:01:21 -0800 Subject: [PATCH 21/34] updated bigquery --- vector_etl/source_mods/google_bigquery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py index 51e979d..c82a5a2 100644 --- a/vector_etl/source_mods/google_bigquery.py +++ b/vector_etl/source_mods/google_bigquery.py @@ -1,6 +1,6 @@ import os from google.cloud import bigquery -from base import BaseSource +from .base import BaseSource import logging logging.basicConfig(level=logging.INFO) From 2d4b77521d92352aca595f42acc0c9bc40f19c1e Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sat, 7 Sep 2024 07:18:22 -0800 Subject: [PATCH 22/34] update source config file --- examples/airtable_to_pincone.yaml | 2 ++ examples/flutterwave_to_pinecone.yaml | 2 ++ examples/gmail_to_pinecone.yaml | 2 ++ examples/google_bigquery_to_pincone.yaml | 1 + examples/hubspot_to_pinecone.yaml | 3 ++- examples/intercom_to_pinecone.yaml | 3 +++ examples/mailchimp_to_pinecone.yaml | 2 ++ examples/paystack_to_pincone.yaml | 2 ++ examples/zohocrm_to_pinecone.yaml | 2 ++ examples/zohodesk_to_pinecone.yaml | 4 +++- vector_etl/source_mods/__init__.py | 2 +- vector_etl/source_mods/airtable_loader.py | 18 +++++++++++++++++- vector_etl/source_mods/paystack_loader.py | 1 - 13 files changed, 39 insertions(+), 5 deletions(-) diff --git a/examples/airtable_to_pincone.yaml b/examples/airtable_to_pincone.yaml index 5e95087..d28a5ae 100644 --- a/examples/airtable_to_pincone.yaml +++ b/examples/airtable_to_pincone.yaml @@ -18,4 +18,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: 
"us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/flutterwave_to_pinecone.yaml b/examples/flutterwave_to_pinecone.yaml index 8e93e83..54fae51 100644 --- a/examples/flutterwave_to_pinecone.yaml +++ b/examples/flutterwave_to_pinecone.yaml @@ -17,4 +17,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/gmail_to_pinecone.yaml b/examples/gmail_to_pinecone.yaml index 794db28..e342151 100644 --- a/examples/gmail_to_pinecone.yaml +++ b/examples/gmail_to_pinecone.yaml @@ -17,4 +17,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/google_bigquery_to_pincone.yaml b/examples/google_bigquery_to_pincone.yaml index 3075154..a6df386 100644 --- a/examples/google_bigquery_to_pincone.yaml +++ b/examples/google_bigquery_to_pincone.yaml @@ -17,4 +17,5 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] diff --git a/examples/hubspot_to_pinecone.yaml b/examples/hubspot_to_pinecone.yaml index aaaff75..5a0f4b1 100644 --- a/examples/hubspot_to_pinecone.yaml +++ b/examples/hubspot_to_pinecone.yaml @@ -2,7 +2,6 @@ source: source_data_type: "HubSpot" archive: "" limit: "" - client_secret: "" access_token: "" crm_object: "" @@ -20,4 +19,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/intercom_to_pinecone.yaml b/examples/intercom_to_pinecone.yaml index 55195d6..bc16438 100644 --- a/examples/intercom_to_pinecone.yaml +++ b/examples/intercom_to_pinecone.yaml @@ -18,3 +18,6 @@ target: region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + + diff --git a/examples/mailchimp_to_pinecone.yaml b/examples/mailchimp_to_pinecone.yaml index 8fd7087..e44d0e7 100644 --- a/examples/mailchimp_to_pinecone.yaml +++ b/examples/mailchimp_to_pinecone.yaml @@ -18,4 +18,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/paystack_to_pincone.yaml b/examples/paystack_to_pincone.yaml index b301a73..99adffa 100644 --- a/examples/paystack_to_pincone.yaml +++ b/examples/paystack_to_pincone.yaml @@ -17,4 +17,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/zohocrm_to_pinecone.yaml b/examples/zohocrm_to_pinecone.yaml index 8d55cb1..7753a5d 100644 --- a/examples/zohocrm_to_pinecone.yaml +++ b/examples/zohocrm_to_pinecone.yaml @@ -22,4 +22,6 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index +embed_columns: [] + diff --git a/examples/zohodesk_to_pinecone.yaml b/examples/zohodesk_to_pinecone.yaml index 8d408b3..7a929ed 100644 --- a/examples/zohodesk_to_pinecone.yaml +++ b/examples/zohodesk_to_pinecone.yaml @@ -4,7 +4,7 @@ source: client_id: "" client_secret: "" code: "" - per_page: "" + limit: "" records: "" accounts_url: "" @@ -22,4 +22,6 @@ target: cloud: "aws" #[Optional] Only 
required if creating a new index
   region: "us-east-1" #[Optional] Only required if creating a new index

+embed_columns: []
+

diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index c6d3b7f..63c0097 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -52,7 +52,7 @@ def get_source_class(config):
         return ZohoDeskSource(config)
     elif source_type == "InterCom":
         return InterComSource(config)
-    elif source_type == 'PayStackS':
+    elif source_type == 'PayStacks':
         return PayStackSource(config)
     elif source_type == "FlutterWave":
         return FlutterWaveSource(config)

diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py
index 35889c1..951aa92 100644
--- a/vector_etl/source_mods/airtable_loader.py
+++ b/vector_etl/source_mods/airtable_loader.py
@@ -1,5 +1,5 @@
 import requests
-from .base import BaseSource
+from base import BaseSource
 import pandas as pd
 import logging
@@ -31,7 +31,23 @@ def fetch_data(self):
         airtable_df = pd.DataFrame(df_data)

         return airtable_df
+
+
+config = {
+    "auth_token": "***REDACTED***",
+    "baseId": "appjx8zUtVJcjvxys",
+    "tableIdOrName": "Sales"
+}
+
+
+table = AirTableSource(config)
+
+
+data = table.fetch_data()
+print(data)

diff --git a/vector_etl/source_mods/paystack_loader.py b/vector_etl/source_mods/paystack_loader.py
index 0fd7b8a..b27c8a4 100644
--- a/vector_etl/source_mods/paystack_loader.py
+++ b/vector_etl/source_mods/paystack_loader.py
@@ -1,7 +1,6 @@
 from .base import BaseSource
 import logging
 import pandas as pd
-
 from paystackapi.paystack import Paystack

From 2ceefa84cbe3e86907b5687ddf9d61f83dd4bba6 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sat, 7 Sep 2024 07:35:14 -0800
Subject: [PATCH 23/34] update source config file

---
 vector_etl/source_mods/airtable_loader.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py
index 951aa92..35889c1 100644
--- a/vector_etl/source_mods/airtable_loader.py
+++ b/vector_etl/source_mods/airtable_loader.py
@@ -1,5 +1,5 @@
 import requests
-from base import BaseSource
+from .base import BaseSource
 import pandas as pd
 import logging
@@ -31,23 +31,7 @@ def fetch_data(self):
         airtable_df = pd.DataFrame(df_data)

         return airtable_df
-
-
-config = {
-    "auth_token": "***REDACTED***",
-    "baseId": "appjx8zUtVJcjvxys",
-    "tableIdOrName": "Sales"
-}
-
-
-table = AirTableSource(config)
-
-
-data = table.fetch_data()
-print(data)

From a591af41691dd2a438d3218c725e10f744d9f285 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sat, 7 Sep 2024 08:10:33 -0800
Subject: [PATCH 24/34] update setup

---
 examples/zohodesk_to_pinecone.yaml         | 1 -
 setup.py                                   | 3 +++
 vector_etl/source_mods/zoho_desk_loader.py | 6 +++---
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/zohodesk_to_pinecone.yaml b/examples/zohodesk_to_pinecone.yaml
index 7a929ed..6059df7 100644
--- a/examples/zohodesk_to_pinecone.yaml
+++ b/examples/zohodesk_to_pinecone.yaml
@@ -4,7 +4,7 @@ source:
   client_id: ""
   client_secret: ""
   code: ""
-  limit: ""
   records: ""
   accounts_url: ""

diff --git a/setup.py b/setup.py
index 32e3a6b..d34fecf 100644
--- a/setup.py
+++ b/setup.py
@@ -69,6 +69,9 @@
         "pytest",
         "nltk",
         "pymilvus",
+        "zohocrmsdk7_0",
+        "paystackapi",
+        "mailchimp-marketing"
     ],
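A possible refinement, not part of this patch: the connector SDKs added above
could ship as optional extras rather than core requirements, e.g.

    extras_require={
        "zoho": ["zohocrmsdk7_0"],
        "paystack": ["paystackapi"],
        "mailchimp": ["mailchimp-marketing"],
    },

so that `pip install <package>[mailchimp]` pulls in only the SDK a given
pipeline needs. The extras names here are illustrative.

    entry_points={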
"console_scripts": [ diff --git a/vector_etl/source_mods/zoho_desk_loader.py b/vector_etl/source_mods/zoho_desk_loader.py index 9a083b8..df073af 100644 --- a/vector_etl/source_mods/zoho_desk_loader.py +++ b/vector_etl/source_mods/zoho_desk_loader.py @@ -68,7 +68,7 @@ def fetch_data(self): self.token = self.connect() if self.config['records'] == 'desk.agents': logger.info("Agents \n") - self.url = f"https://desk.zoho.com/api/v1/agents?limit={self.config['limit']}" + self.url = f"https://desk.zoho.com/api/v1/agents" elif self.config['records'] == 'desk.team': @@ -92,12 +92,12 @@ def fetch_data(self): elif self.config['records'] == 'desk.ticket': logger.info("Ticket \n") self.url = f"""https://desk.zoho.com/api/v1/tickets?include=contacts, - assignee,departments,team,isRead&limit={self.config['limit']}""" + assignee,departments,team,isRead""" elif self.config['records'] == 'desk.contacts': logger.info("Contact \n") - self.url = f"https://desk.zoho.com/api/v1/contacts?limit={self.config['limit']}" + self.url = f"https://desk.zoho.com/api/v1/contacts" From 2ecdd07e7f60f074d360bb8364d0cb6067ff68be Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 8 Sep 2024 02:02:31 -0800 Subject: [PATCH 25/34] update hubspot source to raise valueerror is limit value is greater than 100 --- vector_etl/source_mods/hubspot_loader.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vector_etl/source_mods/hubspot_loader.py b/vector_etl/source_mods/hubspot_loader.py index ca93421..d396fba 100644 --- a/vector_etl/source_mods/hubspot_loader.py +++ b/vector_etl/source_mods/hubspot_loader.py @@ -72,12 +72,17 @@ def fetch_data(self): raise ValueError(f"Unsupported Crm object type: check the object name {self.config['crm_object']}") - response = self.connect(self.endpoints)['results'] - results = [results['properties'] for results in response] - df = pd.DataFrame(results) - logger.info(f" data \n {df}") + response = self.connect(self.endpoints) + + if 'results' in response: + print(response) + results = [results['properties'] for results in response['results']] + df = pd.DataFrame(results) + logger.info(f" data \n {df}") + return df + else: + raise ValueError(response['message']) - return df From 7104583154eac1015d4812b8f070535805ef0a7b Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 8 Sep 2024 17:44:18 -0800 Subject: [PATCH 26/34] added DigitalOcean Source --- examples/digital_ocean_space_to_pincone.yaml | 25 +++++++ .../digital_ocean_spaces_loader.py | 72 +++++++++++++++++++ vector_etl/source_mods/gmail_loader.py | 2 +- 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 examples/digital_ocean_space_to_pincone.yaml create mode 100644 vector_etl/source_mods/digital_ocean_spaces_loader.py diff --git a/examples/digital_ocean_space_to_pincone.yaml b/examples/digital_ocean_space_to_pincone.yaml new file mode 100644 index 0000000..fd2c771 --- /dev/null +++ b/examples/digital_ocean_space_to_pincone.yaml @@ -0,0 +1,25 @@ +source: + source_data_type: "DigitalOcean" + bucket_name: "scrap-data" + prefix: "latestArticles_Monday-26-August-2024" + region_name: 'https://nyc3.digitaloceanspaces.com' + endpoint_url: 'nyc3' + file_type: "csv" #required if prefix is a directory: Will retrieve all files with filetype + aws_access_key_id: "your-access-key" + aws_secret_access_key: "your-secret-access-key" + +embedding: + embedding_model: "OpenAI" + api_key: "your-openai-api-key" + model_name: "text-embedding-ada-002" + +target: + target_database: "Pinecone" + 
+  pinecone_api_key: "your-pinecone-api-key"
+  index_name: "my-index"
+  dimension: 1536 #[Optional] Only required if creating a new index
+  metric: "cosine" #[Optional] Only required if creating a new index
+  cloud: "aws" #[Optional] Only required if creating a new index
+  region: "us-east-1" #[Optional] Only required if creating a new index
+
+embed_columns: [] #Empty Array: File based sources do not require embedding columns
\ No newline at end of file

diff --git a/vector_etl/source_mods/digital_ocean_spaces_loader.py b/vector_etl/source_mods/digital_ocean_spaces_loader.py
new file mode 100644
index 0000000..0b62efb
--- /dev/null
+++ b/vector_etl/source_mods/digital_ocean_spaces_loader.py
@@ -0,0 +1,72 @@
+import boto3
+import logging
+import os
+from .file_loader import FileBaseSource
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class DigitalOceanSpaceSource(FileBaseSource):
+    def __init__(self, config):
+        super().__init__(config)
+        self.s3_client = None
+        self.bucket_name = config['bucket_name']
+        self.prefix = config.get('prefix', '')
+        self.file_type = config.get('file_type', '')
+
+    def connect(self):
+        logger.info("Connecting to DigitalOcean Space...")
+        self.s3_client = boto3.client(
+            's3',
+            region_name=self.config['region_name'],
+            endpoint_url=self.config['endpoint_url'],
+            aws_access_key_id=self.config['aws_access_key_id'],
+            aws_secret_access_key=self.config['aws_secret_access_key']
+        )
+        logger.info("Connected to DigitalOcean Space successfully.")
+
+    def list_files(self):
+        if not self.s3_client:
+            self.connect()
+
+        paginator = self.s3_client.get_paginator('list_objects_v2')
+        files = []
+        for page in paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix):
+            for obj in page.get('Contents', []):
+                if obj['Key'].endswith(self.file_type):
+                    files.append(obj['Key'])
+
+        return files
+
+    def read_file(self, file_path):
+        downloaded_files = []
+
+        local_file_path = os.path.join(os.getcwd(), file_path.split('/')[-1])
+        self.s3_client.download_file(self.bucket_name, file_path, local_file_path)
+        downloaded_files.append(file_path)
+        logger.info(f"Downloaded {file_path} to {os.getcwd()}")
+
+        return downloaded_files
+
+    def download_file(self, file_path):
+        if not self.s3_client:
+            self.connect()
+
+        download_folder = 'tempfile_downloads'
+        if not os.path.exists(download_folder):
+            os.makedirs(download_folder)
+
+        logger.info("Downloading files from DigitalOcean Space...")
+
+        local_file_path = os.path.join(download_folder, file_path.split('/')[-1])
+        self.s3_client.download_file(self.bucket_name, file_path, local_file_path)
+        logger.info(f"Downloaded {file_path} to {os.getcwd()}")
+
+    def delete_directory(self, path):
+        for root, dirs, files in os.walk(path, topdown=False):
+            for file in files:
+                os.remove(os.path.join(root, file))
+            for dir in dirs:
+                os.rmdir(os.path.join(root, dir))
+        os.rmdir(path)

diff --git a/vector_etl/source_mods/gmail_loader.py b/vector_etl/source_mods/gmail_loader.py
index 9858bd0..1eaedbe 100644
--- a/vector_etl/source_mods/gmail_loader.py
+++ b/vector_etl/source_mods/gmail_loader.py
@@ -71,7 +71,7 @@ def parse_messages(self, service, messages, label):
             msg = service.users().messages().get(userId="me", id=message["id"]).execute()
             headers = msg["payload"]["headers"]

-            subject, sender = self._get_header_info(headers)
+            subject, sender = self.get_header_info(headers)
             snippet = msg.get("snippet", "")
             body = self.get_body(msg)
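A minimal usage sketch for the new source, assuming it is instantiated directly
(bucket, prefix, and credentials below are placeholders):

    from vector_etl.source_mods.digital_ocean_spaces_loader import DigitalOceanSpaceSource

    config = {
        "source_data_type": "DigitalOcean",
        "bucket_name": "my-space",
        "prefix": "exports/",
        "file_type": "csv",
        "region_name": "nyc3",                                 # Spaces region slug
        "endpoint_url": "https://nyc3.digitaloceanspaces.com", # full endpoint URL
        "aws_access_key_id": "<spaces-access-key>",
        "aws_secret_access_key": "<spaces-secret-key>",
    }

    source = DigitalOceanSpaceSource(config)
    source.connect()
    for key in source.list_files():    # keys under the prefix ending in "csv"
        source.download_file(key)      # saved into ./tempfile_downloads/

The next patch wires "DigitalOcean" into get_source_class so the pipeline can
construct this source from YAML.

From 8354c17f30a32ad2f6d962616b588c04edbf565a Mon Sep 17 00:00:00 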
2001 From: owolabi-develop Date: Sun, 8 Sep 2024 18:01:29 -0800 Subject: [PATCH 27/34] added digitalOcean Source Type --- vector_etl/source_mods/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py index 63c0097..9b808d0 100644 --- a/vector_etl/source_mods/__init__.py +++ b/vector_etl/source_mods/__init__.py @@ -19,7 +19,7 @@ from .flutterwave_loader import FlutterWaveSource from .gmail_loader import GmailSource from .mailchimp_loader import MailChimpMarketingSource - +from .digital_ocean_spaces_loader import DigitalOceanSpaceSource def get_source_class(config): source_type = config['source_data_type'] if source_type == 'Amazon S3': @@ -60,5 +60,7 @@ def get_source_class(config): return GmailSource(config) elif source_type == "MailChimp": return MailChimpMarketingSource(config) + elif source_type == "DigitalOcean": + DigitalOceanSpaceSource(config) else: raise ValueError(f"Unsupported source type: {source_type}") From 81728af63a5b6d14de290288f711f38babf40041 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 8 Sep 2024 18:52:59 -0800 Subject: [PATCH 28/34] updated digitalOcean source --- vector_etl/source_mods/digital_ocean_spaces_loader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vector_etl/source_mods/digital_ocean_spaces_loader.py b/vector_etl/source_mods/digital_ocean_spaces_loader.py index 0b62efb..4c5a0af 100644 --- a/vector_etl/source_mods/digital_ocean_spaces_loader.py +++ b/vector_etl/source_mods/digital_ocean_spaces_loader.py @@ -49,6 +49,7 @@ def read_file(self, file_path): return downloaded_files + def download_file(self, file_path): if not self.s3_client: self.connect() @@ -63,10 +64,12 @@ def download_file(self, file_path): self.s3_client.download_file(self.bucket_name, file_path, local_file_path) logger.info(f"Downloaded {file_path} to {os.getcwd()}") + def delete_directory(self, path): + for root, dirs, files in os.walk(path, topdown=False): for file in files: os.remove(os.path.join(root, file)) for dir in dirs: os.rmdir(os.path.join(root, dir)) - os.rmdir(path) + os.rmdir(path) \ No newline at end of file From 978ade356a3071a92819f02a9fb7fdf09ab5ad10 Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Sun, 8 Sep 2024 18:58:03 -0800 Subject: [PATCH 29/34] updated digitalOcean source --- vector_etl/source_mods/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py index 9b808d0..c4687c4 100644 --- a/vector_etl/source_mods/__init__.py +++ b/vector_etl/source_mods/__init__.py @@ -20,6 +20,7 @@ from .gmail_loader import GmailSource from .mailchimp_loader import MailChimpMarketingSource from .digital_ocean_spaces_loader import DigitalOceanSpaceSource + def get_source_class(config): source_type = config['source_data_type'] if source_type == 'Amazon S3': @@ -61,6 +62,6 @@ def get_source_class(config): elif source_type == "MailChimp": return MailChimpMarketingSource(config) elif source_type == "DigitalOcean": - DigitalOceanSpaceSource(config) + return DigitalOceanSpaceSource(config) else: raise ValueError(f"Unsupported source type: {source_type}") From 4e754e9d5aad7449e54495ba53e75f39cac86efa Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Fri, 13 Sep 2024 14:45:46 -0800 Subject: [PATCH 30/34] remove secrete --- examples/airtable_to_pincone.yaml | 8 +- examples/digital_ocean_space_to_pincone.yaml | 19 +-- examples/flutterwave_to_pinecone.yaml | 22 
---
 examples/gmail_to_pinecone.yaml              |  22 ---
 examples/hubspot_to_pinecone.yaml            |   8 +-
 examples/intercom_to_pinecone.yaml           |   2 +-
 examples/mailchimp_to_pinecone.yaml          |  23 ---
 examples/paystack_to_pincone.yaml            |  22 ---
 examples/zohocrm_to_pinecone.yaml            |  27 ----
 examples/zohodesk_to_pinecone.yaml           |  26 ---
 setup.py                                     |   3 -
 vector_etl/source_mods/flutterwave_loader.py | 113 -------------
 vector_etl/source_mods/gmail_loader.py       | 102 ------------
 vector_etl/source_mods/google_bigquery.py    |  45 ------
 vector_etl/source_mods/mailchimp_loader.py   | 100 ------------
 vector_etl/source_mods/paystack_loader.py    | 108 -------------
 vector_etl/source_mods/zoho_crm_loader.py    | 162 -------------------
 vector_etl/source_mods/zoho_desk_loader.py   | 116 -------------
 18 files changed, 20 insertions(+), 908 deletions(-)
 delete mode 100644 examples/flutterwave_to_pinecone.yaml
 delete mode 100644 examples/gmail_to_pinecone.yaml
 delete mode 100644 examples/mailchimp_to_pinecone.yaml
 delete mode 100644 examples/paystack_to_pincone.yaml
 delete mode 100644 examples/zohocrm_to_pinecone.yaml
 delete mode 100644 examples/zohodesk_to_pinecone.yaml
 delete mode 100644 vector_etl/source_mods/flutterwave_loader.py
 delete mode 100644 vector_etl/source_mods/gmail_loader.py
 delete mode 100644 vector_etl/source_mods/google_bigquery.py
 delete mode 100644 vector_etl/source_mods/mailchimp_loader.py
 delete mode 100644 vector_etl/source_mods/paystack_loader.py
 delete mode 100644 vector_etl/source_mods/zoho_crm_loader.py
 delete mode 100644 vector_etl/source_mods/zoho_desk_loader.py

diff --git a/examples/airtable_to_pincone.yaml b/examples/airtable_to_pincone.yaml
index d28a5ae..32b0d38 100644
--- a/examples/airtable_to_pincone.yaml
+++ b/examples/airtable_to_pincone.yaml
@@ -1,7 +1,7 @@
 source:
   source_data_type: "AirTable"
   auth_token: ""
-  baseId: "sales"
+  baseId: ""
   tableIdOrName: ""

 embedding:
@@ -11,13 +11,13 @@ embedding:
 target:
   target_database: "Pinecone"
-  pinecone_api_key: ""
-  index_name: ""
+  pinecone_api_key: "***REDACTED***"
+  index_name: "context-data-etl-test"
   dimension: 1536 #[Optional] Only required if creating a new index
   metric: "cosine" #[Optional] Only required if creating a new index
   cloud: "aws" #[Optional] Only required if creating a new index
   region: "us-east-1" #[Optional] Only required if creating a new index

-embed_columns: []
+embed_columns: []

diff --git a/examples/digital_ocean_space_to_pincone.yaml b/examples/digital_ocean_space_to_pincone.yaml
index fd2c771..5efb031 100644
--- a/examples/digital_ocean_space_to_pincone.yaml
+++ b/examples/digital_ocean_space_to_pincone.yaml
@@ -1,12 +1,12 @@
 source:
   source_data_type: "DigitalOcean"
-  bucket_name: "scrap-data"
+  bucket_name: ""
-  prefix: "latestArticles_Monday-26-August-2024"
+  prefix: ""
-  region_name: 'nyc3'
+  region_name: ''
-  endpoint_url: 'https://nyc3.digitaloceanspaces.com'
+  endpoint_url: ''
   file_type: "csv" #required if prefix is a directory: Will retrieve all files with filetype
-  aws_access_key_id: "your-access-key"
+  aws_access_key_id: ""
-  aws_secret_access_key: "your-secret-access-key"
+  aws_secret_access_key: ""

 embedding:
   embedding_model: "OpenAI"
   api_key: "your-openai-api-key"
   model_name: "text-embedding-ada-002"

 target:
   target_database: "Pinecone"
-  pinecone_api_key: "your-pinecone-api-key"
+  pinecone_api_key: ""
-  index_name: "my-index"
+  index_name: ""
   dimension: 1536 #[Optional] Only required if creating a new index
   metric: "cosine" #[Optional] Only required if creating a new index
   cloud: "aws" #[Optional] Only required if creating a new index
   region: 
"us-east-1" #[Optional] Only required if creating a new index -embed_columns: [] #Empty Array: File based sources do not require embedding columns \ No newline at end of file + +embed_columns: [] \ No newline at end of file diff --git a/examples/flutterwave_to_pinecone.yaml b/examples/flutterwave_to_pinecone.yaml deleted file mode 100644 index 54fae51..0000000 --- a/examples/flutterwave_to_pinecone.yaml +++ /dev/null @@ -1,22 +0,0 @@ -source: - source_data_type: "FlutterWave" - secret_key: "" - records: "flutterwave.payout-subaccounts" - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/examples/gmail_to_pinecone.yaml b/examples/gmail_to_pinecone.yaml deleted file mode 100644 index e342151..0000000 --- a/examples/gmail_to_pinecone.yaml +++ /dev/null @@ -1,22 +0,0 @@ -source: - source_data_type: "Gmail" - credentisla: "" - gmail.label: 'IMPORTANT' - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/examples/hubspot_to_pinecone.yaml b/examples/hubspot_to_pinecone.yaml index 5a0f4b1..05f6836 100644 --- a/examples/hubspot_to_pinecone.yaml +++ b/examples/hubspot_to_pinecone.yaml @@ -1,9 +1,9 @@ source: source_data_type: "HubSpot" - archive: "" - limit: "" + archive: "false" + limit: "100" access_token: "" - crm_object: "" + crm_object: "crm.contacts" embedding: embedding_model: "OpenAI" @@ -19,6 +19,8 @@ target: cloud: "aws" #[Optional] Only required if creating a new index region: "us-east-1" #[Optional] Only required if creating a new index + embed_columns: [] + diff --git a/examples/intercom_to_pinecone.yaml b/examples/intercom_to_pinecone.yaml index bc16438..c7ed855 100644 --- a/examples/intercom_to_pinecone.yaml +++ b/examples/intercom_to_pinecone.yaml @@ -1,6 +1,6 @@ source: source_data_type: "InterCom" - token: "FlutterWave" + token: "" records: "intercom.teams" embedding: diff --git a/examples/mailchimp_to_pinecone.yaml b/examples/mailchimp_to_pinecone.yaml deleted file mode 100644 index e44d0e7..0000000 --- a/examples/mailchimp_to_pinecone.yaml +++ /dev/null @@ -1,23 +0,0 @@ -source: - source_data_type: "MailChimp" - api_key: "" - server_prefix: "us13" - records: "ConnectedSites" - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/examples/paystack_to_pincone.yaml b/examples/paystack_to_pincone.yaml 
deleted file mode 100644 index 99adffa..0000000 --- a/examples/paystack_to_pincone.yaml +++ /dev/null @@ -1,22 +0,0 @@ -source: - source_data_type: "PayStackS" - paystack_secret_key: "" - records: "paystack.transactions" - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/examples/zohocrm_to_pinecone.yaml b/examples/zohocrm_to_pinecone.yaml deleted file mode 100644 index 7753a5d..0000000 --- a/examples/zohocrm_to_pinecone.yaml +++ /dev/null @@ -1,27 +0,0 @@ -source: - source_data_type: "ZohoCrm" - grant_type: "" - client_id: "" - client_secret: "" - code: "" - per_page: "" - records: "" - accounts_url: "" - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/examples/zohodesk_to_pinecone.yaml b/examples/zohodesk_to_pinecone.yaml deleted file mode 100644 index 6059df7..0000000 --- a/examples/zohodesk_to_pinecone.yaml +++ /dev/null @@ -1,26 +0,0 @@ -source: - source_data_type: "ZohoDesk" - grant_type: "" - client_id: "" - client_secret: "" - code: "" - records: "" - accounts_url: "" - -embedding: - embedding_model: "OpenAI" - api_key: "" - model_name: "text-embedding-ada-002" - -target: - target_database: "Pinecone" - pinecone_api_key: "" - index_name: "" - dimension: 1536 #[Optional] Only required if creating a new index - metric: "cosine" #[Optional] Only required if creating a new index - cloud: "aws" #[Optional] Only required if creating a new index - region: "us-east-1" #[Optional] Only required if creating a new index - -embed_columns: [] - - diff --git a/setup.py b/setup.py index d34fecf..32e3a6b 100644 --- a/setup.py +++ b/setup.py @@ -69,9 +69,6 @@ "pytest", "nltk", "pymilvus", - "zohocrmsdk7_0", - "paystackapi", - "mailchimp-marketing" ], entry_points={ "console_scripts": [ diff --git a/vector_etl/source_mods/flutterwave_loader.py b/vector_etl/source_mods/flutterwave_loader.py deleted file mode 100644 index d49f70a..0000000 --- a/vector_etl/source_mods/flutterwave_loader.py +++ /dev/null @@ -1,113 +0,0 @@ -from .base import BaseSource -import requests -from pprint import pprint -import logging -import pandas as pd - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class FlutterWaveSource(BaseSource): - def __init__(self,config): - self.config = config - self.url = None - self.secret_key = self.config['secret_key'] - - - def flatten_dict(self, d, parent_key='', sep='_'): - items = [] - for k, v in d.items(): - new_key = f"{parent_key}{sep}{k}" if parent_key else k - if isinstance(v, dict): - items.extend(self.flatten_dict(v, new_key, sep=sep).items()) - elif isinstance(v, list): - for i, item in enumerate(v): - if isinstance(item, dict): - items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", 
sep=sep).items()) - else: - items.append((f"{new_key}{sep}{i}", item)) - else: - items.append((new_key, v)) - return dict(items) - - - def connect(self,url): - headers = {"Authorization": f"Bearer {self.secret_key}", - "Content-type":"application/json", - "Intercom-Version":"2.11"} - response = requests.get(url=url,headers=headers) - - return response - - - def fetch_data(self): - - if self.config['records'] == 'flutterwave.transfers': - logger.info(" Transfers \n") - self.url = f"https://api.flutterwave.com/v3/transfers" - - response = self.connect(self.url).json()['data'] - - elif self.config['records'] == 'flutterwave.transactions': - logger.info(" transactions \n") - self.url = f"https://api.flutterwave.com/v3/transactions" - - response = self.connect(self.url).json()['data'] - - - elif self.config['records'] == 'flutterwave.beneficiaries': - logger.info(" Transfers \n") - self.url = f"https://api.flutterwave.com/v3/beneficiaries" - - response = self.connect(self.url).json()['data'] - - - elif self.config['records'] == 'flutterwave.subaccounts': - logger.info(" subaccounts \n") - self.url = f"https://api.flutterwave.com/v3/subaccounts" - - response = self.connect(self.url).json()['data'] - - elif self.config['records'] == 'flutterwave.payout-subaccounts': - logger.info(" payout-subaccounts \n") - self.url = f"https://api.flutterwave.com/v3/payout-subaccounts" - - response = self.connect(self.url).json()['data'] - - elif self.config['records'] == 'flutterwave.subscriptions': - logger.info(" subscriptions \n") - self.url = f"https://api.flutterwave.com/v3/subscriptions" - - response = self.connect(self.url).json()['data'] - - - elif self.config['records'] == 'flutterwave.payment-plans': - logger.info(" payment-plans \n") - self.url = f"https://api.flutterwave.com/v3/payment-plans" - - response = self.connect(self.url).json()['data'] - - - - - - try: - flattened_data = [self.flatten_dict(item) for item in response] - pprint(flattened_data,indent=4) - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - except requests.exceptions.HTTPError as http_err: - logger.error(f"HTTP error occurred: {http_err}") - - - - - - - - - diff --git a/vector_etl/source_mods/gmail_loader.py b/vector_etl/source_mods/gmail_loader.py deleted file mode 100644 index 1eaedbe..0000000 --- a/vector_etl/source_mods/gmail_loader.py +++ /dev/null @@ -1,102 +0,0 @@ -import os.path -import base64 -from google.auth.transport.requests import Request -from google.oauth2.credentials import Credentials -from google_auth_oauthlib.flow import InstalledAppFlow -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError -from pprint import pprint -from .base import BaseSource -import pandas as pd - -class GmailSource(BaseSource): - def __init__(self, config): - self.config = config - self.SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] - - def connect(self): - creds = None - if os.path.exists("token.json"): - creds = Credentials.from_authorized_user_file("token.json", self.SCOPES) - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - self.config['credentials'], self.SCOPES - ) - creds = flow.run_local_server(port=0) - with open("token.json", "w") as token: - token.write(creds.to_json()) - return creds - - def fetch_data(self): - creds = self.connect() - service = build("gmail", "v1", credentials=creds) - - # Extract the label from 
config - label = self.config.get('gmail.label').upper() - - messages = self.get_messages(service, label) - - if messages: - email_data = self.parse_messages(service, messages, label) - df = pd.DataFrame(email_data) - - return df - else: - print("No messages found.") - return None - - def get_messages(self, service, label): - try: - results = service.users().messages().list(userId="me", labelIds=[label]).execute() - return results.get("messages", []) - except HttpError as error: - print(f"An error occurred while fetching messages for label {label}: {error}") - return None - - def parse_messages(self, service, messages, label): - email_data = { - "id": [], - "threadId": [], - "label": [], - "subject": [], - "from": [], - "snippet": [], - "body": [], - } - - for message in messages: - msg = service.users().messages().get(userId="me", id=message["id"]).execute() - headers = msg["payload"]["headers"] - - subject, sender = self.get_header_info(headers) - snippet = msg.get("snippet", "") - body = self.get_body(msg) - - email_data["id"].append(message["id"]) - email_data["threadId"].append(message["threadId"]) - email_data["label"].append(label) - email_data["subject"].append(subject) - email_data["from"].append(sender) - email_data["snippet"].append(snippet) - email_data["body"].append(body) - - return email_data - - def get_header_info(self, headers): - subject = None - sender = None - for header in headers: - if header["name"] == "Subject": - subject = header["value"] - if header["name"] == "From": - sender = header["value"] - return subject, sender - - def get_body(self, msg): - if "data" in msg["payload"]["body"]: - return base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8") - else: - return "" diff --git a/vector_etl/source_mods/google_bigquery.py b/vector_etl/source_mods/google_bigquery.py deleted file mode 100644 index c82a5a2..0000000 --- a/vector_etl/source_mods/google_bigquery.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -from google.cloud import bigquery -from .base import BaseSource -import logging - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class GoogleBigQuerySource(BaseSource): - def __init__(self,config): - self.config = config - self.client = None - self.connect() - - - def connect(self): - if self.config["db_type"] == 'google_bigquery': - os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.config['GOOGLE_APPLICATION_CREDENTIALS'] - self.client = bigquery.Client() - - def fetch_data(self): - if self.client: - try: - query_job = self.client.query(f"""{self.config.get("query"," ")}""") - if query_job: - dfrows = query_job.result().to_dataframe() - return dfrows - else: - logger.error(f"No data returned: {e}") - return None - except Exception as e: - logger.error(f"An error occurred: {e}") - return None - - - -config = {"query":"SELECT * FROM bigquery-public-data.america_health_rankings.ahr LIMIT 100", - "GOOGLE_APPLICATION_CREDENTIALS":"contextData_bigquery_cred.json", - "db_type":'google_bigquery' - } - - - - diff --git a/vector_etl/source_mods/mailchimp_loader.py b/vector_etl/source_mods/mailchimp_loader.py deleted file mode 100644 index d77a49d..0000000 --- a/vector_etl/source_mods/mailchimp_loader.py +++ /dev/null @@ -1,100 +0,0 @@ -from .base import BaseSource -import mailchimp_marketing as MailchimpMarketing -from mailchimp_marketing.api_client import ApiClientError -import pandas as pd -import logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) -from pprint import pprint - - - 
-class MailChimpMarketingSource(BaseSource): - def __init__(self,config): - self.config = config - self.api_key = self.config['api_key'] - self.server_prefix = self.config['server_prefix'] - - - def flatten_dict(self, d, parent_key='', sep='_'): - items = [] - for k, v in d.items(): - new_key = f"{parent_key}{sep}{k}" if parent_key else k - if isinstance(v, dict): - items.extend(self.flatten_dict(v, new_key, sep=sep).items()) - elif isinstance(v, list): - for i, item in enumerate(v): - if isinstance(item, dict): - items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items()) - else: - items.append((f"{new_key}{sep}{i}", item)) - else: - items.append((new_key, v)) - return dict(items) - - - def connect(self): - try: - client = MailchimpMarketing.Client() - client.set_config({ - "api_key":self.api_key, - "server": self.server_prefix - }) - return client - except ApiClientError as error: - print("Error: {}".format(error.text)) - - - - - - def fetch_data(self): - client = self.connect() - if self.config['records'] == "campaign": - response = client.campaigns.list()['campaigns'] - - elif self.config['records'] == "campaignFolders": - response = client.campaignFolders.list()['folders'] - - - elif self.config['records'] == "ConnectedSites": - response = client.connectedSites.list()['sites'] - - - elif self.config['records'] == "conversations": - response = client.ecommerce.stores()['conversations'] - - - elif self.config['records'] == "ecommerce": - response = client.conversations.list()['stores'] - - elif self.config['records'] == "facebookAds": - response = client.facebookAds.list()['facebook_ads'] - - elif self.config['records'] == "landingpages": - response = client.landingPages.get_all()['landing_pages'] - - - elif self.config['records'] == "reports": - response = client.reports.get_all_campaign_reports()['reports'] - - - - try: - flattened_data = [self.flatten_dict(item) for item in response] - pprint(flattened_data,indent=4) - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - except ApiClientError as error: - logger.error(f"HTTP error occurred: {error.text}") - - - - - - - diff --git a/vector_etl/source_mods/paystack_loader.py b/vector_etl/source_mods/paystack_loader.py deleted file mode 100644 index b27c8a4..0000000 --- a/vector_etl/source_mods/paystack_loader.py +++ /dev/null @@ -1,108 +0,0 @@ -from .base import BaseSource -import logging -import pandas as pd -from paystackapi.paystack import Paystack - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class PayStackSource(BaseSource): - def __init__(self,config): - self.config = config - self.paystack_secret_key = self.config['paystack_secret_key'] - - - def flatten_dict(self, d, parent_key='', sep='_'): - items = [] - for k, v in d.items(): - new_key = f"{parent_key}{sep}{k}" if parent_key else k - if isinstance(v, dict): - items.extend(self.flatten_dict(v, new_key, sep=sep).items()) - elif isinstance(v, list): - for i, item in enumerate(v): - if isinstance(item, dict): - items.extend(self.flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items()) - else: - items.append((f"{new_key}{sep}{i}", item)) - else: - items.append((new_key, v)) - return dict(items) - - - def connect(self): - - paystack = Paystack(secret_key=self.paystack_secret_key) - - return paystack - - - def fetch_data(self): - - if self.config['records'] == 'paystack.transactions': - logger.info(" Transactions \n") - response = self.connect().transaction.list()['data'] - - elif 
self.config['records'] == 'paystack.transactions.split': - logger.info(" Transactions split \n") - response = self.connect().transactionSplit.list()['data'] - - elif self.config['records'] == 'paystack.invoice': - logger.info(" invoice \n") - response = self.connect().invoice.list()['data'] - - elif self.config['records'] == 'paystack.product': - logger.info(" product \n") - response = self.connect().product.list()['data'] - - elif self.config['records'] == 'paystack.customer': - logger.info(" customer \n") - response = self.connect().customer.list()['data'] - - elif self.config['records'] == 'paystack.plan': - logger.info(" plan \n") - response = self.connect().plan.list()['data'] - - elif self.config['records'] == 'paystack.subaccount': - logger.info(" subaccount \n") - response = self.connect().subaccount().list()['data'] - - - elif self.config['records'] == 'paystack.subscription': - logger.info(" subaccount \n") - response = self.connect().subscription.list()['data'] - - - elif self.config['records'] == 'paystack.transfer': - logger.info(" transfer \n") - response = self.connect().transfer.list()['data'] - - - elif self.config['records'] == 'paystack.bulkcharge': - logger.info(" bulkcharge \n") - response = self.connect().bulkcharge.list()['data'] - - - elif self.config['records'] == 'paystack.refund': - logger.info(" refund \n") - response = self.connect().refund.list()['data'] - - - try: - flattened_data = [self.flatten_dict(item) for item in response] - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - except Exception as http_err: - logger.error(f"HTTP error occurred: {http_err}") - - - - - - - - diff --git a/vector_etl/source_mods/zoho_crm_loader.py b/vector_etl/source_mods/zoho_crm_loader.py deleted file mode 100644 index b06c4db..0000000 --- a/vector_etl/source_mods/zoho_crm_loader.py +++ /dev/null @@ -1,162 +0,0 @@ -from .base import BaseSource -import pandas as pd -import requests -import logging -from pprint import pprint -import os -import json - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - - - - -class ZohoCrmSource(BaseSource): - def __init__(self,config): - self.config = config - self.token = None - self.url = None - self.grant_type = self.config['grant_type'] - self.client_id = self.config['client_id'] - self.client_secret = self.config['client_secret'] - self.code = self.config['code'] - self.accounts_url = self.config['accounts_url'] - - - def flatten_dict(self, d, parent_key='', sep='_'): - - items = [] - for k, v in d.items(): - new_key = f"{parent_key}{sep}{k}" if parent_key else k - if isinstance(v, dict): - items.extend(self.flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - - def connect(self): - - data = { - "grant_type":self.grant_type, - "client_id": self.client_id, - "client_secret": self.client_secret, - "code": self.code - } - try: - if os.path.exists("token.json"): - with open("token.json",'r') as token_file: - token_data = json.load(token_file) - self.token = token_data.get("access_token") - return self.token - else: - response = requests.post(url=self.accounts_url, data=data) - logger.info(f"Status {response.status_code}") - with open("token.json", 'w') as token_file: - json.dump({"access_token": response.json()["access_token"]}, token_file) - - logger.info("New token fetched and saved.") - tokens = response.json()["access_token"] - return tokens - except requests.exceptions.HTTPError as http_err: - logger.error(f"connection 
Error {http_err}") - - - - def fetch_data(self): - - self.token = self.connect() - if self.config['records'] == 'module.Contacts': - logger.info("Contact \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Contacts?fields=Acount_Name, - First_Name,Lead_Source,Home,Fax,Skype_ID,Asst_Phone,Phone, - Title,Department,Twitter,Last_Name,Contact_Name,Phone,Email,Reporting_To, - Mailing_Street,Mailing_City,Mailing_State,Mailing_Zip,Mailing_Country, - Description,Contact_Owner,Lead_Source,Date_of_Birth,Contact_Image - &converted=true&per_page={self.config['per_page']}""" - - elif self.config['records'] == 'module.Accounts': - logger.info("Accounts \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Accounts?fields=Account_Owner,Account_Name,Account_Site,Parent_Account, - Account_Number,Account_Type,Industry,Annual_Revenue,Rating,Phone,Fax,Website,Ticker_Symbol,OwnerShip,Employees,Sic_Code, - Billing_Street,Billing_City,Billing_State,Billing_Code,Billing_Country,Shipping_Street,Shipping_City,Shipping_State,Shipping_Code, - Shipping_Country,Description - &converted=true&per_page={self.config['per_page']}""" - - - - - elif self.config['records'] == 'module.Leads': - logger.info("Leads \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Leads?fields=Lead_Owner,First_Name,Title,Mobile,Lead_Source, - Industry,Annual_Revenue,Company,Last_Name,Email,Fax,Website,Lead_Status,Rating,Skype_ID, - Description,Twitter,City,Street,State,Country,Zip_Code,No_of_Employees - &converted=true&per_page={self.config['per_page']}""" - - elif self.config['records'] == 'module.Deals': - logger.info("Deals \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Deals?fields=Deal_Owner,Deal_Name,Account_Name, - Type,Next_Step,Lead_Source,Contact_Name,Amount,Closing_Date,Stage,Probability,Expected_Revenue, - Campaign_Source,Description - &converted=true&per_page={self.config['per_page']}""" - - - - elif self.config['records'] == 'module.Campaigns': - logger.info("Campaigns \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Campaigns?fields=Campaign_Owner,Campaign_Name,Start_Date, - Expected_Revenue,Actual_Cost,Number_sent,Type,Status,End_Date,Budgeted_Cost,Expected_Response,Description - &converted=true&per_page={self.config['per_page']}""" - - - - - elif self.config['records'] == 'module.Tasks': - logger.info("Tasks \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Tasks?fields=Task_Owner,Subject,Due_Date,Contact,Deal,Status,Priority,Reminder, - Repeat,Description - &converted=true&per_page={self.config['per_page']}""" - - - - elif self.config['records'] == 'module.Calls': - logger.info("Calls \n") - self.url = f"""https://www.zohoapis.com/crm/v5/Calls?fields=Call_To,Related_To,Call_Type,Outgoing_Call_Status, - Call_Start_Time,Call_Owner,Subject,Created_By,Modified_By,Call_Purpose,Call_Agenda&converted=true&per_page={self.config['per_page']}""" - - - elif self.config['records'] == 'module': - logger.info("Calls \n") - self.url = "https://www.zohoapis.com/crm/v5/settings/modules," - - headers = {"Authorization":f"Zoho-oauthtoken {self.token}"} - - response = requests.get(url=self.url,headers=headers).json()['data'] - - flattened_data = [self.flatten_dict(item) for item in response] - - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - - - - - - - - - - - - - - - - diff --git a/vector_etl/source_mods/zoho_desk_loader.py b/vector_etl/source_mods/zoho_desk_loader.py deleted file mode 100644 index df073af..0000000 --- a/vector_etl/source_mods/zoho_desk_loader.py +++ /dev/null @@ 
-1,116 +0,0 @@ -from .base import BaseSource -import pandas as pd -import requests -import logging -from pprint import pprint -import os -import json - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class ZohoDeskSource(BaseSource): - def __init__(self,config): - self.config = config - self.token = None - self.url = None - self.grant_type = self.config['grant_type'] - self.client_id = self.config['client_id'] - self.client_secret = self.config['client_secret'] - self.code = self.config['code'] - self.accounts_url = self.config['accounts_url'] - - - def flatten_dict(self, d, parent_key='', sep='_'): - - items = [] - for k, v in d.items(): - new_key = f"{parent_key}{sep}{k}" if parent_key else k - if isinstance(v, dict): - items.extend(self.flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) - - - def connect(self): - - data = { - "grant_type":self.grant_type, - "client_id": self.client_id, - "client_secret": self.client_secret, - "code": self.code - } - try: - if os.path.exists("token.json"): - with open("token.json",'r') as token_file: - token_data = json.load(token_file) - self.token = token_data.get("access_token") - return self.token - else: - response = requests.post(url=self.accounts_url, data=data) - logger.info(f"Status {response.status_code}") - with open("token.json", 'w') as token_file: - json.dump({"access_token": response.json()["access_token"]}, token_file) - - logger.info("New token fetched and saved.") - tokens = response.json()["access_token"] - return tokens - except requests.exceptions.HTTPError as http_err: - logger.error(f"connection Error {http_err}") - - - - def fetch_data(self): - - self.token = self.connect() - if self.config['records'] == 'desk.agents': - logger.info("Agents \n") - self.url = f"https://desk.zoho.com/api/v1/agents" - - - elif self.config['records'] == 'desk.team': - logger.info("Teams \n") - self.url = f"https://desk.zoho.com/api/v1/teams" - headers = {"Authorization":f"Zoho-oauthtoken {self.token}"} - - response = requests.get(url=self.url,headers=headers).json()['teams'] - - flattened_data = [self.flatten_dict(item) for item in response] - - - df = pd.DataFrame(flattened_data ) - - logger.info(f" data \n {df}") - - return df - - - - elif self.config['records'] == 'desk.ticket': - logger.info("Ticket \n") - self.url = f"""https://desk.zoho.com/api/v1/tickets?include=contacts, - assignee,departments,team,isRead""" - - - elif self.config['records'] == 'desk.contacts': - logger.info("Contact \n") - self.url = f"https://desk.zoho.com/api/v1/contacts" - - - - - - - - - - - - - - - - From d59de562bd7a3f16389e9d67b0aaadb59b0dae2e Mon Sep 17 00:00:00 2001 From: owolabi-develop Date: Tue, 20 Aug 2024 13:11:04 -0800 Subject: [PATCH 31/34] updated base module import --- vector_etl/source_mods/airtable_loader.py | 4 +++ vector_etl/source_mods/google_bigquery.py | 39 +++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 vector_etl/source_mods/google_bigquery.py diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py index 35889c1..4eaf933 100644 --- a/vector_etl/source_mods/airtable_loader.py +++ b/vector_etl/source_mods/airtable_loader.py @@ -1,5 +1,9 @@ import requests from .base import BaseSource +<<<<<<< HEAD +======= +from pprint import pprint +>>>>>>> f842a37 (updated base module import) import pandas as pd import logging diff --git a/vector_etl/source_mods/google_bigquery.py 
b/vector_etl/source_mods/google_bigquery.py
new file mode 100644
index 0000000..affa5f9
--- /dev/null
+++ b/vector_etl/source_mods/google_bigquery.py
@@ -0,0 +1,39 @@
+import os
+from google.cloud import bigquery
+from .base import BaseSource
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class GoogleBigQuerySource(BaseSource):
+    def __init__(self,config):
+        self.config = config
+        self.client = None
+        self.connect()
+
+
+    def connect(self):
+        if self.config["db_type"] == 'google_bigquery':
+            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.config['GOOGLE_APPLICATION_CREDENTIALS']
+            self.client = bigquery.Client()
+
+    def fetch_data(self):
+        if self.client:
+            try:
+                query_job = self.client.query(f"""{self.config.get("query"," ")}""")
+                if query_job:
+                    dfrows = query_job.result().to_dataframe()
+                    return dfrows
+                else:
+                    logger.error("No data returned.")
+                    return None
+            except Exception as e:
+                logger.error(f"An error occurred: {e}")
+                return None
+
+
+
+

From 8e5dda327fde7dd7fbed2700e8fd2f9caf4b22c6 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Fri, 13 Sep 2024 15:52:33 -0800
Subject: [PATCH 32/34] remove secret key

---
 tests/test_source_mods.py          | 240 ++++++-----------------------
 vector_etl/source_mods/__init__.py |  21 ---
 2 files changed, 46 insertions(+), 215 deletions(-)

diff --git a/tests/test_source_mods.py b/tests/test_source_mods.py
index d9eb027..e81f594 100644
--- a/tests/test_source_mods.py
+++ b/tests/test_source_mods.py
@@ -5,16 +5,10 @@
 from vector_etl.source_mods.s3_loader import S3Source
 from vector_etl.source_mods.database_loader import DatabaseSource
 from vector_etl.source_mods.local_file import LocalFileSource
-from vector_etl.source_mods.google_bigquery import GoogleBigQuerySource
 from vector_etl.source_mods.airtable_loader import AirTableSource
 from vector_etl.source_mods.hubspot_loader import HubSpotSource
 from vector_etl.source_mods.intercom_loader import InterComSource
-from vector_etl.source_mods.paystack_loader import PayStackSource
-from vector_etl.source_mods.zoho_crm_loader import ZohoCrmSource
-from vector_etl.source_mods.zoho_desk_loader import ZohoDeskSource
-from vector_etl.source_mods.flutterwave_loader import FlutterWaveSource
-from vector_etl.source_mods.gmail_loader import GmailSource
-from vector_etl.source_mods.mailchimp_loader import MailChimpMarketingSource
+from vector_etl.source_mods.digital_ocean_spaces_loader import DigitalOceanSpaceSource
 
 @pytest.fixture
 def s3_config():
@@ -27,14 +21,21 @@ def s3_config():
         'chunk_overlap': 200
     }
 
+
 @pytest.fixture
-def google_bigquery_config():
+def digital_ocean_config():
     return {
-        "source_data_type": "Google BigQuery",
-        "google_application_credentials": "",
-        "query": "SELECT * FROM chipotle_stores LIMIT 10"
-
+        'aws_access_key_id': 'test_key',
+        'endpoint_url':'test_endpoint',
+        'region_name':'test_region',
+        'aws_secret_access_key': 'test_secret',
+        'bucket_name': 'test_bucket',
+        'prefix': 'test_prefix/',
+        'chunk_size': 1000,
+        'chunk_overlap': 200
     }
+
+
 
 @pytest.fixture
@@ -47,40 +48,7 @@ def airtable_config():
     return {
         'auth_token': '',
         'baseId': '',
         'tableIdOrName': ''
     }
 
-@pytest.fixture
-def gmail_config():
-    return {
-        'credentials': 'credentials.json', ## path to gmail crendtials
-        'gmail.label': 'IMPORTANT' # Specify the label in the config
-    }
-
-
-
-
-@pytest.fixture
-def zohodesk_config():
-    return{
-        "grant_type":"",
-        "client_id": "",
-        "client_secret": "",
-        "code": "",
-        "limit":"",
-        "records":"desk.team",
-        "accounts_url":""
-    }
-
-@pytest.fixture
-def zohocrm_config():
-    return{
- "grant_type":"", - "client_id": "", - "client_secret": "", - "code": "", - "per_page":"10", - "records":"module.Call", - "accounts_url":"" - } @pytest.fixture @@ -93,21 +61,6 @@ def hubspot_config(): } -@pytest.fixture -def paystack_config(): - return{ - "paystack_secret_key":"", - "records": "paystack.transactions", - } - - - -@pytest.fixture -def flutterwave_config(): - return{ - "secret_key":"", - "records": "flutterwave.payout-subaccounts", - } @pytest.fixture def intercom_config(): @@ -140,13 +93,8 @@ def local_file_config(): } -@pytest.fixture -def mailchimp_config(): - return { - 'api_key': 'test_key', - 'server': 'test_secret', - 'records': 'test_bucket', - } + + def test_s3_source_connect(s3_config): with patch('boto3.client') as mock_client: @@ -210,23 +158,6 @@ def test_local_file_source_read_file(local_file_config): assert isinstance(file_content, BytesIO) -def test_google_bigquery_connect(google_bigquery_config): - with patch('bigquery.connect') as mock_connect: - source = GoogleBigQuerySource(google_bigquery_config) - source.connect() - mock_connect.assert_called_once_with( - source_data_type="Google BigQuery", - google_application_credentials="", - query="SELECT * FROM chipotle_stores LIMIT 10" - ) - - -def test_google_bigquery_fetch_data(google_bigquery_config): - with patch('bigquery.connect') as mock_connect: - mock_connect.result.to_dataframe.return_value = pd.DataFrame() - source = GoogleBigQuerySource(google_bigquery_config) - df = source.fetch_data() - assert isinstance(df, pd.DataFrame) def test_airtable_connect(airtable_config): @@ -257,77 +188,7 @@ def test_airtable_fetch_data(airtable_config): assert isinstance(df, pd.DataFrame) - -def test_zohodesk_connect(zohodesk_config): - - with patch('requests.get') as mock_connect: - source = ZohoDeskSource(zohodesk_config) - source.connect() - mock_connect.assert_called_once_with( - grant_type="", - client_id = "", - client_secret="", - code="", - accounts_url="" - ) - -def test_zohodesk_fetch_data(zohodesk_config): - with patch('requests.get') as mock_connect: - mock_connect.return_value = [ { - "Address": "333 Post St", - "Name": "Union Square", - "Visited": True - } - ] - - source = ZohoDeskSource(zohodesk_config) - df = source.fetch_data() - - assert isinstance(df, pd.DataFrame) - -def test_zohocrm_connect(zohocrm_config): - - with patch('requests.get') as mock_connect: - source = ZohoCrmSource(zohocrm_config) - source.connect() - mock_connect.assert_called_once_with( - grant_type="", - client_id = "", - client_secret="", - code="", - accounts_url="" - ) - - -def test_zohocrm_fetch_data(zohocrm_config): - with patch('requests.get') as mock_connect: - mock_connect.return_value = [ { } - ] - - source = ZohoCrmSource(zohocrm_config) - df = source.fetch_data() - - assert isinstance(df, pd.DataFrame) - -def test_paystack_connect(paystack_config): - - with patch('requests.get') as mock_connect: - source = PayStackSource(paystack_config) - source.connect() - mock_connect.assert_called_once_with( - paystack_secret_key="", - ) - - -def test_paystack_fetch_data(paystack_config): - with patch('Paystack') as mock_connect: - mock_connect.return_value = [{}] - - source = PayStackSource(paystack_config) - df = source.fetch_data() - - assert isinstance(df, pd.DataFrame) def test_intercom_connect(intercom_config): @@ -340,78 +201,69 @@ def test_intercom_connect(intercom_config): ) -def test_intercom_fetch_data(intercom_config): +def test_hubspot_fetch_data(hubspot_config): with patch('requests.get') as mock_connect: 
         mock_connect.return_value = [{}]
-        source = InterComSource(intercom_config)
+        source = HubSpotSource(hubspot_config)
         df = source.fetch_data()
 
         assert isinstance(df, pd.DataFrame)
-
-def test_flutterwave_connect(flutterwave_config):
+
+
+def test_hubspot_connect(hubspot_config):
     with patch('requests.get') as mock_connect:
-        source = FlutterWaveSource(flutterwave_config)
+        source = HubSpotSource(hubspot_config)
         source.connect()
         mock_connect.assert_called_once_with(
             secret_key="",
         )
 
-def test_flutterwave_fetch_data(flutterwave_config):
+def test_intercom_fetch_data(intercom_config):
     with patch('requests.get') as mock_connect:
         mock_connect.return_value = [{}]
-        source = FlutterWaveSource(flutterwave_config)
+        source = InterComSource(intercom_config)
         df = source.fetch_data()
 
         assert isinstance(df, pd.DataFrame)
+
-
-def test_gmail_connect(gmail_config):
-
-    with patch('InstalledAppFlow.from_client_secrets_file') as mock_connect:
-        source = GmailSource(gmail_config)
+def test_digital_ocean_source_connect(digital_ocean_config):
+    with patch('boto3.client') as mock_client:
+        source = DigitalOceanSpaceSource(digital_ocean_config)
         source.connect()
-        mock_connect.assert_called_once_with(
-            credentials="credential.json",
-        )
+        mock_client.assert_called_once_with(
+            's3',
+            aws_access_key_id='test_key',
+            aws_secret_access_key='test_secret',
+            endpoint_url='test_endpoint',
+            region_name='test_region',
+        )
 
+def test_digital_ocean_list_files(digital_ocean_config):
+    with patch('boto3.client') as mock_client:
+        mock_paginator = Mock()
+        mock_paginator.paginate.return_value = [
+            {'Contents': [{'Key': 'test_prefix/file1.csv'}, {'Key': 'test_prefix/file2.csv'}]}
+        ]
+        mock_client.return_value.get_paginator.return_value = mock_paginator
 
-def test_gmail_fetch_data(gmail_config):
-    with patch('requests.get') as mock_connect:
-        mock_connect.return_value = [{}]
-        source = GmailSource(gmail_config)
-        df = source.fetch_data()
+        source = DigitalOceanSpaceSource(digital_ocean_config)
+        source.connect()
+        files = source.list_files()
 
-        assert isinstance(df, pd.DataFrame)
+        assert files == ['test_prefix/file1.csv', 'test_prefix/file2.csv']
+
 
-def test_mailchimp_connect(mailchimp_config):
-
-    with patch('MailchimpMarketing.Client.set_config') as mock_connect:
-        source = MailChimpMarketingSource(mailchimp_config)
-        source.connect()
-        mock_connect.assert_called_once_with(
-            api_key="",
-            server=""
-        )
-
-
-def test_mailchimp_fetch_data(mailchimp_config):
-    with patch('MailchimpMarketing.Client.set_config') as mock_connect:
-        mock_connect.return_value = [{}]
-        source = MailChimpMarketingSource(mailchimp_config)
-        df = source.fetch_data()
-
-        assert isinstance(df, pd.DataFrame)
-
diff --git a/vector_etl/source_mods/__init__.py b/vector_etl/source_mods/__init__.py
index c4687c4..5274afe 100644
--- a/vector_etl/source_mods/__init__.py
+++ b/vector_etl/source_mods/__init__.py
@@ -10,15 +10,8 @@
 from .google_cloud_storage import GoogleCloudStorageSource
 from .local_file import LocalFileSource
 from .airtable_loader import AirTableSource
-from .google_bigquery import GoogleBigQuerySource
 from .hubspot_loader import HubSpotSource
-from .zoho_crm_loader import ZohoCrmSource
-from .zoho_desk_loader import ZohoDeskSource
 from .intercom_loader import InterComSource
-from .paystack_loader import PayStackSource
-from .flutterwave_loader import FlutterWaveSource
-from .gmail_loader import GmailSource
-from .mailchimp_loader import MailChimpMarketingSource
 from .digital_ocean_spaces_loader import DigitalOceanSpaceSource
 
 def get_source_class(config):
@@ -43,24 +36,10 @@ def get_source_class(config):
         return GoogleCloudStorageSource(config)
     elif source_type == 'AirTable':
         return AirTableSource(config)
-    elif source_type == 'Google BigQuery':
-        return GoogleBigQuerySource(config)
     elif source_type == 'HubSpot':
         return HubSpotSource(config)
-    elif source_type == 'ZohoCrm':
-        return ZohoCrmSource(config)
-    elif source_type == 'ZohoDesk':
-        return ZohoDeskSource(config)
     elif source_type == "InterCom":
         return InterComSource(config)
-    elif source_type == 'PayStacks':
-        return PayStackSource(config)
-    elif source_type == "FlutterWave":
-        return FlutterWaveSource(config)
-    elif source_type == "Gmail":
-        return GmailSource(config)
-    elif source_type == "MailChimp":
-        return MailChimpMarketingSource(config)
     elif source_type == "DigitalOcean":
         return DigitalOceanSpaceSource(config)
     else:
         raise ValueError(f"Unsupported source type: {source_type}")

From a3ff626f3a6517ceeebebb5bf8db6e232aa24cbd Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sun, 15 Sep 2024 12:51:03 -0800
Subject: [PATCH 33/34] accept incoming merged changes

---
 vector_etl/source_mods/airtable_loader.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vector_etl/source_mods/airtable_loader.py b/vector_etl/source_mods/airtable_loader.py
index 4eaf933..0e14b58 100644
--- a/vector_etl/source_mods/airtable_loader.py
+++ b/vector_etl/source_mods/airtable_loader.py
@@ -1,9 +1,6 @@
 import requests
 from .base import BaseSource
-<<<<<<< HEAD
-=======
 from pprint import pprint
->>>>>>> f842a37 (updated base module import)
 import pandas as pd
 import logging

From 9c048719972cca60c56c0ad5f9129a20fe508983 Mon Sep 17 00:00:00 2001
From: owolabi-develop
Date: Sun, 15 Sep 2024 13:08:16 -0800
Subject: [PATCH 34/34] added airtable, hubspot, digitalocean and intercom
 source sample yaml configs

---
 README.md | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/README.md b/README.md
index ed2588f..1f2f76f 100644
--- a/README.md
+++ b/README.md
@@ -307,6 +307,46 @@ source:
   chunk_overlap: 0
 ```
 
+##### DigitalOcean Source
+```yaml
+source:
+  source_data_type: "DigitalOcean"
+  bucket_name: "my-bucket"
+  prefix: "path/to/files/"
+  file_type: ".csv"
+  region_name: 'your-region'
+  endpoint_url: 'your-endpoint-url'
+  aws_access_key_id: "your-access-key"
+  aws_secret_access_key: "your-secret-key"
+```
+
+##### Airtable Source
+```yaml
+source:
+  source_data_type: "AirTable"
+  auth_token: ""
+  baseId: ""
+  tableIdOrName: ""
+```
+
+##### HubSpot Source
+```yaml
+source:
+  source_data_type: "HubSpot"
+  archive: "false"
+  limit: "100"
+  access_token: ""
+  crm_object: "crm.contacts"
+```
+
+##### Intercom Source
+```yaml
+source:
+  source_data_type: "InterCom"
+  token: ""
+  records: "intercom.teams"
+```
+
 #### Using Unstructured to process source files
 
 Starting from version 0.1.6.3, you can now add Unstructured as a file processing API. Users can utilize [Unstructured's Serverless API](https://unstructured.io/api-key-hosted) to efficiently extract data from a multitude of file-based sources.
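Below is a rough sketch of how an Unstructured-backed file source might be wired into a pipeline config, following the file-based source examples above. The `use_unstructured` and `unstructured_api_key` keys are illustrative assumptions rather than confirmed option names — check the configuration reference for the exact schema:

```yaml
source:
  source_data_type: "Amazon S3"
  bucket_name: "my-bucket"
  prefix: "path/to/files/"
  file_type: ".pdf"
  aws_access_key_id: "your-access-key"
  aws_secret_access_key: "your-secret-key"
  use_unstructured: true                              # assumed flag for routing files through Unstructured
  unstructured_api_key: "your-unstructured-api-key"   # assumed key name for the Serverless API credential
```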