|
| 1 | +# youtube search scraper |
| 2 | +import requests |
| 3 | +import json |
| 4 | +from bs4 import BeautifulSoup |
| 5 | +import re |
| 6 | + |
SEARCH_URL = "https://www.youtube.com/results?search_query=indonesia"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever


def fetch_search_page(url=SEARCH_URL, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body as text.

    Raises ``requests.HTTPError`` on a 4xx/5xx status so an error page is
    never handed to the parser as if it were a results page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def extract_initial_data(script_text):
    """Extract the ``ytInitialData`` JSON value from a script's source text.

    Returns a ``(json_text, data)`` tuple: the exact JSON substring and the
    decoded object.

    The value is decoded with ``JSONDecoder.raw_decode`` starting right after
    the ``ytInitialData =`` assignment, so the decoder itself decides where
    the value ends.  A non-greedy ``{.*?};`` regex would instead stop at the
    first ``};`` it sees, including one inside a JSON string, and truncate
    the payload.

    Raises ``ValueError`` if the assignment is missing or the value that
    follows it is not valid JSON (``json.JSONDecodeError`` is a subclass).
    """
    marker = re.search(r'ytInitialData\s*=\s*', script_text)
    if marker is None:
        raise ValueError("ytInitialData assignment not found")
    start = marker.end()
    data, end = json.JSONDecoder().raw_decode(script_text, start)
    return script_text[start:end], data


def find_initial_data(soup):
    """Return ``(json_text, data)`` from the first usable <script> in *soup*.

    Every script whose text mentions ``ytInitialData`` is tried in document
    order; scripts that mention the name without a decodable assignment are
    skipped.  Raises ``ValueError`` when none of them yields data.
    """
    for tag in soup.find_all('script', string=re.compile('ytInitialData')):
        try:
            return extract_initial_data(tag.string)
        except ValueError:
            continue
    raise ValueError("ytInitialData not found in any <script> tag")


def text_of(node, default=None):
    """Return the display text of a text node, or *default* if it has none.

    Two shapes are handled: ``{"simpleText": "..."}`` and
    ``{"runs": [{"text": "..."}, ...]}``.  For ``runs`` all pieces are joined.
    Anything else (including a missing node, i.e. ``None``) gives *default*.
    """
    if not isinstance(node, dict):
        return default
    if 'simpleText' in node:
        return node['simpleText']
    runs = node.get('runs')
    if runs:
        return ''.join(run.get('text', '') for run in runs)
    return default


def parse_video(video):
    """Build the summary dict for one ``videoRenderer`` entry.

    Fields absent from the entry become ``None`` instead of raising, so a
    single incomplete result does not abort the whole listing.
    """
    return {
        "title": text_of(video.get('title')),
        "video_id": video.get('videoId'),
        "published_time": text_of(video.get('publishedTimeText')),
        "view_count": text_of(video.get('viewCountText')),
    }


def parse_playlist(playlist):
    """Build the summary dict for one ``playlistRenderer`` entry.

    Missing fields become ``None`` rather than raising.
    """
    return {
        "title": text_of(playlist.get('title')),
        "playlist_id": playlist.get('playlistId'),
        "video_count": playlist.get('videoCount'),
    }


def iter_results(contents):
    """Yield ``(label, summary)`` for each video or playlist in *contents*.

    *contents* is the list of sections; only ``itemSectionRenderer`` sections
    are inspected and, within them, only ``videoRenderer`` and
    ``playlistRenderer`` items.  Everything else is ignored.  *label* is
    ``"Video"`` or ``"Playlist"``.
    """
    for section in contents:
        if 'itemSectionRenderer' not in section:
            continue
        for item in section['itemSectionRenderer'].get('contents', []):
            if 'videoRenderer' in item:
                yield "Video", parse_video(item['videoRenderer'])
            elif 'playlistRenderer' in item:
                yield "Playlist", parse_playlist(item['playlistRenderer'])


def main():
    """Fetch the search page, dump the raw JSON to disk and print results.

    Writes ``data.json`` (the raw ``ytInitialData`` text) and
    ``contents.json`` (the section list), then prints one line per video or
    playlist.  Network and HTTP errors propagate; extraction and file errors
    are reported on stdout with their exception type.
    """
    html = fetch_search_page()
    soup = BeautifulSoup(html, 'lxml')

    try:
        json_text, json_data = find_initial_data(soup)
        with open('data.json', 'w', encoding="utf-8") as outfile:
            outfile.write(json_text)

        contents = (
            json_data['contents']['twoColumnSearchResultsRenderer']
            ['primaryContents']['sectionListRenderer']['contents']
        )
        with open('contents.json', 'w', encoding="utf-8") as outfile:
            json.dump(contents, outfile)

        for label, obj in iter_results(contents):
            print(label + ":", obj)
    except (KeyError, TypeError, ValueError, OSError) as e:
        # str(KeyError) is only the missing key; include the type so the
        # message says what actually went wrong.
        print(type(e).__name__ + ": " + str(e))


if __name__ == "__main__":
    main()
0 commit comments