diff --git a/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb b/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb
index 7f13ff4..8e873e5 100644
--- a/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb
+++ b/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb
@@ -1,423 +1,149 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Segmenting and Clustering Neighbourhoods in Toronto"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Importing libraries "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import requests"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get the HTML page of Wiki, and using read_html we convert the html data into list of Data frame objects."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Remove cells which have borrow not assigned.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " Postal Code | \n",
- " Borough | \n",
- " Neighborhood | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 2 | \n",
- " M3A | \n",
- " North York | \n",
- " Parkwoods | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 3 | \n",
- " M4A | \n",
- " North York | \n",
- " Victoria Village | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 4 | \n",
- " M5A | \n",
- " Downtown Toronto | \n",
- " Regent Park, Harbourfront | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 5 | \n",
- " M6A | \n",
- " North York | \n",
- " Lawrence Manor, Lawrence Heights | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 6 | \n",
- " M7A | \n",
- " Downtown Toronto | \n",
- " Queen's Park, Ontario Provincial Government | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index Postal Code Borough \\\n",
- "0 2 M3A North York \n",
- "1 3 M4A North York \n",
- "2 4 M5A Downtown Toronto \n",
- "3 5 M6A North York \n",
- "4 6 M7A Downtown Toronto \n",
- "\n",
- " Neighborhood \n",
- "0 Parkwoods \n",
- "1 Victoria Village \n",
- "2 Regent Park, Harbourfront \n",
- "3 Lawrence Manor, Lawrence Heights \n",
- "4 Queen's Park, Ontario Provincial Government "
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n",
- "wiki_page = requests.get(wiki)\n",
- "\n",
- "wiki_raw = pd.read_html(wiki_page.content, header = 0)[0]\n",
- "df = wiki_raw[wiki_raw.Neighborhood != 'Not assigned']\n",
- "df.reset_index(inplace = True)\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " Borough | \n",
- " Neighborhood | \n",
- "
\n",
- " \n",
- " | Postal Code | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | M1B | \n",
- " 9 | \n",
- " Scarborough | \n",
- " Malvern, Rouge | \n",
- "
\n",
- " \n",
- " | M1C | \n",
- " 18 | \n",
- " Scarborough | \n",
- " Rouge Hill, Port Union, Highland Creek | \n",
- "
\n",
- " \n",
- " | M1E | \n",
- " 27 | \n",
- " Scarborough | \n",
- " Guildwood, Morningside, West Hill | \n",
- "
\n",
- " \n",
- " | M1G | \n",
- " 36 | \n",
- " Scarborough | \n",
- " Woburn | \n",
- "
\n",
- " \n",
- " | M1H | \n",
- " 45 | \n",
- " Scarborough | \n",
- " Cedarbrae | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | M9N | \n",
- " 98 | \n",
- " York | \n",
- " Weston | \n",
- "
\n",
- " \n",
- " | M9P | \n",
- " 107 | \n",
- " Etobicoke | \n",
- " Westmount | \n",
- "
\n",
- " \n",
- " | M9R | \n",
- " 116 | \n",
- " Etobicoke | \n",
- " Kingsview Village, St. Phillips, Martin Grove ... | \n",
- "
\n",
- " \n",
- " | M9V | \n",
- " 143 | \n",
- " Etobicoke | \n",
- " South Steeles, Silverstone, Humbergate, Jamest... | \n",
- "
\n",
- " \n",
- " | M9W | \n",
- " 152 | \n",
- " Etobicoke | \n",
- " Northwest, West Humber - Clairville | \n",
- "
\n",
- " \n",
- "
\n",
- "
103 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " index Borough \\\n",
- "Postal Code \n",
- "M1B 9 Scarborough \n",
- "M1C 18 Scarborough \n",
- "M1E 27 Scarborough \n",
- "M1G 36 Scarborough \n",
- "M1H 45 Scarborough \n",
- "... ... ... \n",
- "M9N 98 York \n",
- "M9P 107 Etobicoke \n",
- "M9R 116 Etobicoke \n",
- "M9V 143 Etobicoke \n",
- "M9W 152 Etobicoke \n",
- "\n",
- " Neighborhood \n",
- "Postal Code \n",
- "M1B Malvern, Rouge \n",
- "M1C Rouge Hill, Port Union, Highland Creek \n",
- "M1E Guildwood, Morningside, West Hill \n",
- "M1G Woburn \n",
- "M1H Cedarbrae \n",
- "... ... \n",
- "M9N Weston \n",
- "M9P Westmount \n",
- "M9R Kingsview Village, St. Phillips, Martin Grove ... \n",
- "M9V South Steeles, Silverstone, Humbergate, Jamest... \n",
- "M9W Northwest, West Humber - Clairville \n",
- "\n",
- "[103 rows x 3 columns]"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby(['Postal Code']).first()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The new Wiki link already has nieghborhood merged according to Postal Code and Borough\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "103"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(df['Postal Code'].unique())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " Postal Code | \n",
- " Borough | \n",
- " Neighborhood | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [index, Postal Code, Borough, Neighborhood]\n",
- "Index: []"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['Borough'] == 'Not assigned']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(103, 4)"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "# Segmenting and Clustering Neighbourhoods in Toronto"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Importing libraries "
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": "import pandas as pd\nimport numpy as np\nimport requests"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Fetch the Wikipedia page, then use `pd.read_html` to convert its HTML tables into a list of DataFrame objects."
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Remove cells that have a Borough of 'Not assigned'.\n"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n index | \n variable | \n value | \n
\n \n \n \n | 0 | \n 1 | \n 0 | \n M1BScarborough(Malvern / Rouge) | \n
\n \n | 1 | \n 2 | \n 0 | \n M1CScarborough(Rouge Hill / Port Union / Highl... | \n
\n \n | 2 | \n 3 | \n 0 | \n M1EScarborough(Guildwood / Morningside / West ... | \n
\n \n | 3 | \n 4 | \n 0 | \n M1GScarborough(Woburn) | \n
\n \n | 4 | \n 5 | \n 0 | \n M1HScarborough(Cedarbrae) | \n
\n \n | ... | \n ... | \n ... | \n ... | \n
\n \n | 98 | \n 170 | \n 8 | \n M9NYork(Weston) | \n
\n \n | 99 | \n 171 | \n 8 | \n M9PEtobicoke(Westmount) | \n
\n \n | 100 | \n 172 | \n 8 | \n M9REtobicoke(Kingsview Village / St. Phillips ... | \n
\n \n | 101 | \n 175 | \n 8 | \n M9VEtobicoke(South Steeles / Silverstone / Hum... | \n
\n \n | 102 | \n 176 | \n 8 | \n M9WEtobicokeNorthwest(Clairville / Humberwood ... | \n
\n \n
\n
103 rows \u00d7 3 columns
\n
",
+ "text/plain": " index variable value\n0 1 0 M1BScarborough(Malvern / Rouge)\n1 2 0 M1CScarborough(Rouge Hill / Port Union / Highl...\n2 3 0 M1EScarborough(Guildwood / Morningside / West ...\n3 4 0 M1GScarborough(Woburn)\n4 5 0 M1HScarborough(Cedarbrae)\n.. ... ... ...\n98 170 8 M9NYork(Weston)\n99 171 8 M9PEtobicoke(Westmount)\n100 172 8 M9REtobicoke(Kingsview Village / St. Phillips ...\n101 175 8 M9VEtobicoke(South Steeles / Silverstone / Hum...\n102 176 8 M9WEtobicokeNorthwest(Clairville / Humberwood ...\n\n[103 rows x 3 columns]"
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": "wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\nwiki_page = requests.get(wiki)\nwiki_raw = pd.read_html(wiki_page.content)[0]\n#Check wiki URL to understand Code below\ndf=wiki_raw.melt() #Melting grid to single column\ndf=df[df.value.str.contains(\"Not assigned\")==False]\ndf.reset_index(inplace = True)\n#Splitting value into Borough,Neighborhood,Postalcode\ndf['Postal Code'] = df['value'].map(lambda x: x[0:3]) #Extracting First 3 Characters i.e Postal Code \ndf['value']=df['value'].map(lambda x: x[3:]) #Isolating the postal code to extract Borough,Neighborhood by splitting with \"(\"\ndf[['Borough','Neighborhood','Empty']] =df.value.str.split('(',expand=True)\ndf['Neighborhood']=df['Neighborhood'].str.replace(')', '', regex=False) #Removing ) from Neighborhood\ndf['Neighborhood']=df['Neighborhood'].str.replace('/', ',', regex=False) #Replacing / with \",\"\ndf.drop(columns = ['variable','value','Empty'],inplace = True)"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n index | \n Borough | \n Neighborhood | \n
\n \n | Postal Code | \n | \n | \n | \n
\n \n \n \n | M1B | \n 1 | \n Scarborough | \n Malvern , Rouge | \n
\n \n | M1C | \n 2 | \n Scarborough | \n Rouge Hill , Port Union , Highland Creek | \n
\n \n | M1E | \n 3 | \n Scarborough | \n Guildwood , Morningside , West Hill | \n
\n \n | M1G | \n 4 | \n Scarborough | \n Woburn | \n
\n \n | M1H | \n 5 | \n Scarborough | \n Cedarbrae | \n
\n \n | ... | \n ... | \n ... | \n ... | \n
\n \n | M9N | \n 170 | \n York | \n Weston | \n
\n \n | M9P | \n 171 | \n Etobicoke | \n Westmount | \n
\n \n | M9R | \n 172 | \n Etobicoke | \n Kingsview Village , St. Phillips , Martin Grov... | \n
\n \n | M9V | \n 175 | \n Etobicoke | \n South Steeles , Silverstone , Humbergate , Jam... | \n
\n \n | M9W | \n 176 | \n EtobicokeNorthwest | \n Clairville , Humberwood , Woodbine Downs , Wes... | \n
\n \n
\n
103 rows \u00d7 3 columns
\n
",
+ "text/plain": " index Borough \\\nPostal Code \nM1B 1 Scarborough \nM1C 2 Scarborough \nM1E 3 Scarborough \nM1G 4 Scarborough \nM1H 5 Scarborough \n... ... ... \nM9N 170 York \nM9P 171 Etobicoke \nM9R 172 Etobicoke \nM9V 175 Etobicoke \nM9W 176 EtobicokeNorthwest \n\n Neighborhood \nPostal Code \nM1B Malvern , Rouge \nM1C Rouge Hill , Port Union , Highland Creek \nM1E Guildwood , Morningside , West Hill \nM1G Woburn \nM1H Cedarbrae \n... ... \nM9N Weston \nM9P Westmount \nM9R Kingsview Village , St. Phillips , Martin Grov... \nM9V South Steeles , Silverstone , Humbergate , Jam... \nM9W Clairville , Humberwood , Woodbine Downs , Wes... \n\n[103 rows x 3 columns]"
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": "df.groupby(['Postal Code']).first()"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "The new Wiki page already has the neighborhoods merged according to Postal Code and Borough.\n"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "103"
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": "len(df['Postal Code'].unique())"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": "\n\n
\n \n \n | \n index | \n Postal Code | \n Borough | \n Neighborhood | \n
\n \n \n \n
\n
",
+ "text/plain": "Empty DataFrame\nColumns: [index, Postal Code, Borough, Neighborhood]\nIndex: []"
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": "df[df['Borough'] == 'Not assigned']"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "(103, 4)"
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": "df.shape"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": ""
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.7",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}