Skip to content

Commit a840192

Browse files
committed
init
0 parents  commit a840192

File tree

4 files changed

+74
-0
lines changed

4 files changed

+74
-0
lines changed

contents.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

data.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

youtube_search_scraper.py

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,50 @@
1+
# youtube search scraper
2+
import requests
3+
import json
4+
from bs4 import BeautifulSoup
5+
import re
6+
7+
SEARCH_URL = "https://www.youtube.com/results?search_query=indonesia"


def extract_initial_data(script_text):
    """Extract the object assigned to ``ytInitialData`` in *script_text*.

    The object is decoded with ``json.JSONDecoder.raw_decode`` starting at
    its opening brace, so the decoder itself decides where the object ends.
    A regex such as ``{.*?};`` would stop at the first ``};``, even one that
    sits inside a JSON string value.

    Returns:
        A ``(json_text, data)`` tuple: the exact source text of the object
        and the decoded Python value.

    Raises:
        ValueError: if no ``ytInitialData = {`` assignment is present, or
            (as ``json.JSONDecodeError``) if the object is not valid JSON.
    """
    marker = re.search(r'ytInitialData\s*=\s*(?=\{)', script_text)
    if marker is None:
        raise ValueError("ytInitialData assignment not found in script text")
    start = marker.end()
    data, end = json.JSONDecoder().raw_decode(script_text, start)
    return script_text[start:end], data


def simple_text(node):
    """Return ``node['simpleText']``, or None if the node or key is missing."""
    if not node:
        return None
    return node.get('simpleText')


def summarize_item(item):
    """Build the printable summary for one search-result item.

    Returns:
        ``("Video:", {...})`` for items with a ``videoRenderer``,
        ``("Playlist:", {...})`` for items with a ``playlistRenderer``,
        and None for any other item.
    """
    if 'videoRenderer' in item:
        video = item['videoRenderer']
        return "Video:", {
            "title": video['title']['runs'][0]['text'],
            "video_id": video['videoId'],
            # These two keys are treated as optional so that one result
            # lacking them does not abort the whole listing.
            # NOTE(review): presumably live/upcoming videos omit them, confirm
            # against real responses.
            "published_time": simple_text(video.get('publishedTimeText')),
            "view_count": simple_text(video.get('viewCountText')),
        }
    if 'playlistRenderer' in item:
        playlist = item['playlistRenderer']
        return "Playlist:", {
            "title": playlist['title']['simpleText'],
            "playlist_id": playlist['playlistId'],
            "video_count": playlist['videoCount'],
        }
    return None


def iter_results(contents):
    """Yield ``(label, summary)`` for every video/playlist in *contents*.

    Only sections carrying an ``itemSectionRenderer`` are inspected; items
    that are neither videos nor playlists are skipped.
    """
    for section in contents:
        if 'itemSectionRenderer' not in section:
            continue
        for item in section['itemSectionRenderer']['contents']:
            summary = summarize_item(item)
            if summary is not None:
                yield summary


def main():
    """Fetch the search page, dump its embedded JSON, and print the results.

    Writes the raw ``ytInitialData`` text to ``data.json`` and the result
    sections to ``contents.json``, then prints one line per video/playlist.
    Network errors propagate; parsing errors are reported on stdout.
    """
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status avoids parsing an HTTP error page.
    response = requests.get(SEARCH_URL, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    try:
        script = soup.find('script', string=re.compile('ytInitialData'))
        if script is None:
            raise ValueError("no <script> containing ytInitialData in the page")
        json_text, json_data = extract_initial_data(script.string)
        with open('data.json', 'w', encoding="utf-8") as outfile:
            outfile.write(json_text)
        contents = json_data['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
        with open('contents.json', 'w', encoding="utf-8") as outfile:
            outfile.write(json.dumps(contents))
        for label, obj in iter_results(contents):
            print(label, obj)
    except Exception as e:
        # Top-level boundary: report and exit quietly. The type name is
        # included because str(KeyError) alone is just the missing key.
        print(f"{type(e).__name__}: {e}")


if __name__ == "__main__":
    main()

youtube_search_scraper_scroll.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
import time
2+
3+
from selenium import webdriver
4+
from selenium.webdriver.common.keys import Keys
5+
from selenium.webdriver.common.by import By
6+
7+
# Open the YouTube search results in Chrome and print the elements that
# match the results-container XPath.
browser = webdriver.Chrome()
try:
    browser.get("https://www.youtube.com/results?search_query=indonesia")
    # Fixed pause before querying the DOM.
    # NOTE(review): presumably meant to let the page render; an explicit
    # wait would be more reliable, confirm.
    time.sleep(1)

    print(browser.find_elements(By.XPATH, "//div[@id='contents']"))
finally:
    # Always end the WebDriver session so the browser and driver processes
    # are not left running, even when navigation or lookup fails.
    browser.quit()
13+
14+
# no_of_pagedowns = 20
15+
16+
# while no_of_pagedowns:
17+
# elem.send_keys(Keys.PAGE_DOWN)
18+
# time.sleep(0.2)
19+
# no_of_pagedowns-=1
20+
21+
# for post in post_elems:
22+
# print(post.text)

0 commit comments

Comments
 (0)