diff --git a/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb b/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb index 7f13ff4..8e873e5 100644 --- a/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb +++ b/Applied Data Science Capstone/Week 3/Final Assignment - Segmenting and Clustering Neighborhoods in Toronto/Part 1.ipynb @@ -1,423 +1,149 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Segmenting and Clustering Neighbourhoods in Toronto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Importing libraries " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the HTML page of Wiki, and using read_html we convert the html data into list of Data frame objects." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove cells which have borrow not assigned.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexPostal CodeBoroughNeighborhood
02M3ANorth YorkParkwoods
13M4ANorth YorkVictoria Village
24M5ADowntown TorontoRegent Park, Harbourfront
35M6ANorth YorkLawrence Manor, Lawrence Heights
46M7ADowntown TorontoQueen's Park, Ontario Provincial Government
\n", - "
" - ], - "text/plain": [ - " index Postal Code Borough \\\n", - "0 2 M3A North York \n", - "1 3 M4A North York \n", - "2 4 M5A Downtown Toronto \n", - "3 5 M6A North York \n", - "4 6 M7A Downtown Toronto \n", - "\n", - " Neighborhood \n", - "0 Parkwoods \n", - "1 Victoria Village \n", - "2 Regent Park, Harbourfront \n", - "3 Lawrence Manor, Lawrence Heights \n", - "4 Queen's Park, Ontario Provincial Government " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\n", - "wiki_page = requests.get(wiki)\n", - "\n", - "wiki_raw = pd.read_html(wiki_page.content, header = 0)[0]\n", - "df = wiki_raw[wiki_raw.Neighborhood != 'Not assigned']\n", - "df.reset_index(inplace = True)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexBoroughNeighborhood
Postal Code
M1B9ScarboroughMalvern, Rouge
M1C18ScarboroughRouge Hill, Port Union, Highland Creek
M1E27ScarboroughGuildwood, Morningside, West Hill
M1G36ScarboroughWoburn
M1H45ScarboroughCedarbrae
............
M9N98YorkWeston
M9P107EtobicokeWestmount
M9R116EtobicokeKingsview Village, St. Phillips, Martin Grove ...
M9V143EtobicokeSouth Steeles, Silverstone, Humbergate, Jamest...
M9W152EtobicokeNorthwest, West Humber - Clairville
\n", - "

103 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " index Borough \\\n", - "Postal Code \n", - "M1B 9 Scarborough \n", - "M1C 18 Scarborough \n", - "M1E 27 Scarborough \n", - "M1G 36 Scarborough \n", - "M1H 45 Scarborough \n", - "... ... ... \n", - "M9N 98 York \n", - "M9P 107 Etobicoke \n", - "M9R 116 Etobicoke \n", - "M9V 143 Etobicoke \n", - "M9W 152 Etobicoke \n", - "\n", - " Neighborhood \n", - "Postal Code \n", - "M1B Malvern, Rouge \n", - "M1C Rouge Hill, Port Union, Highland Creek \n", - "M1E Guildwood, Morningside, West Hill \n", - "M1G Woburn \n", - "M1H Cedarbrae \n", - "... ... \n", - "M9N Weston \n", - "M9P Westmount \n", - "M9R Kingsview Village, St. Phillips, Martin Grove ... \n", - "M9V South Steeles, Silverstone, Humbergate, Jamest... \n", - "M9W Northwest, West Humber - Clairville \n", - "\n", - "[103 rows x 3 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(['Postal Code']).first()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The new Wiki link already has nieghborhood merged according to Postal Code and Borough\n" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "103" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df['Postal Code'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexPostal CodeBoroughNeighborhood
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [index, Postal Code, Borough, Neighborhood]\n", - "Index: []" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['Borough'] == 'Not assigned']" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(103, 4)" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# Segmenting and Clustering Neighbourhoods in Toronto" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Importing libraries " + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": "import pandas as pd\nimport numpy as np\nimport requests" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Get the HTML page of Wiki, and using read_html we convert the html data into list of Data frame objects." + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Remove cells which have borrow not assigned.\n" + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexvariablevalue
010M1BScarborough(Malvern / Rouge)
120M1CScarborough(Rouge Hill / Port Union / Highl...
230M1EScarborough(Guildwood / Morningside / West ...
340M1GScarborough(Woburn)
450M1HScarborough(Cedarbrae)
............
981708M9NYork(Weston)
991718M9PEtobicoke(Westmount)
1001728M9REtobicoke(Kingsview Village / St. Phillips ...
1011758M9VEtobicoke(South Steeles / Silverstone / Hum...
1021768M9WEtobicokeNorthwest(Clairville / Humberwood ...
\n

103 rows \u00d7 3 columns

\n
", + "text/plain": " index variable value\n0 1 0 M1BScarborough(Malvern / Rouge)\n1 2 0 M1CScarborough(Rouge Hill / Port Union / Highl...\n2 3 0 M1EScarborough(Guildwood / Morningside / West ...\n3 4 0 M1GScarborough(Woburn)\n4 5 0 M1HScarborough(Cedarbrae)\n.. ... ... ...\n98 170 8 M9NYork(Weston)\n99 171 8 M9PEtobicoke(Westmount)\n100 172 8 M9REtobicoke(Kingsview Village / St. Phillips ...\n101 175 8 M9VEtobicoke(South Steeles / Silverstone / Hum...\n102 176 8 M9WEtobicokeNorthwest(Clairville / Humberwood ...\n\n[103 rows x 3 columns]" + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'\nwiki_page = requests.get(wiki)\nwiki_raw = pd.read_html(wiki_page.content)[0]\n#Check wiki URL to understand Code below\ndf=wiki_raw.melt() #Melting grid to single column\ndf=df[df.value.str.contains(\"Not assigned\")==False]\ndf.reset_index(inplace = True)\n#Splitting value into Borough,Neighborhood,Postalcode\ndf['Postal Code'] = df['value'].map(lambda x: x[0:3]) #Extracting First 3 Characters i.e Postal Code \ndf['value']=df['value'].map(lambda x: x[3:]) #Isolating the postal code to extract Borough,Neighborhood by splitting with \"(\"\ndf[['Borough','Neighborhood','Empty']] =df.value.str.split('(',expand=True)\ndf['Neighborhood']=df['Neighborhood'].str.replace(')', '', regex=False) #Removing ) from Neighborhood\ndf['Neighborhood']=df['Neighborhood'].str.replace('/', ',', regex=False) #Replacing / with \"\"\ndf.drop(columns = ['variable','value','Empty'],inplace = True)" + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
indexBoroughNeighborhood
Postal Code
M1B1ScarboroughMalvern , Rouge
M1C2ScarboroughRouge Hill , Port Union , Highland Creek
M1E3ScarboroughGuildwood , Morningside , West Hill
M1G4ScarboroughWoburn
M1H5ScarboroughCedarbrae
............
M9N170YorkWeston
M9P171EtobicokeWestmount
M9R172EtobicokeKingsview Village , St. Phillips , Martin Grov...
M9V175EtobicokeSouth Steeles , Silverstone , Humbergate , Jam...
M9W176EtobicokeNorthwestClairville , Humberwood , Woodbine Downs , Wes...
\n

103 rows \u00d7 3 columns

\n
", + "text/plain": " index Borough \\\nPostal Code \nM1B 1 Scarborough \nM1C 2 Scarborough \nM1E 3 Scarborough \nM1G 4 Scarborough \nM1H 5 Scarborough \n... ... ... \nM9N 170 York \nM9P 171 Etobicoke \nM9R 172 Etobicoke \nM9V 175 Etobicoke \nM9W 176 EtobicokeNorthwest \n\n Neighborhood \nPostal Code \nM1B Malvern , Rouge \nM1C Rouge Hill , Port Union , Highland Creek \nM1E Guildwood , Morningside , West Hill \nM1G Woburn \nM1H Cedarbrae \n... ... \nM9N Weston \nM9P Westmount \nM9R Kingsview Village , St. Phillips , Martin Grov... \nM9V South Steeles , Silverstone , Humbergate , Jam... \nM9W Clairville , Humberwood , Woodbine Downs , Wes... \n\n[103 rows x 3 columns]" + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df.groupby(['Postal Code']).first()" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "The new Wiki link already has nieghborhood merged according to Postal Code and Borough\n" + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "103" + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "len(df['Postal Code'].unique())" + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n
indexPostal CodeBoroughNeighborhood
\n
", + "text/plain": "Empty DataFrame\nColumns: [index, Postal Code, Borough, Neighborhood]\nIndex: []" + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df[df['Borough'] == 'Not assigned']" + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "(103, 4)" + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df.shape" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 }