Skip to content

Commit 97de2c5

Browse files
committed
flask
1 parent 1da36c4 commit 97de2c5

File tree

10 files changed

+1741
-3919
lines changed

10 files changed

+1741
-3919
lines changed

.env

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# CLUSTER_API_ENDPOINT=https://10.10.72.54/v3/clusters
2+
# BEARER_TOKEN=token-lcp92:bwvvzk4cfzldvchzh9kk6pm8zpz7xrbtbpszxmj6rbmxsjl4b88dkl
3+
# NAMESPACE=tester
4+
# CLUSTER=c-rxtpn
5+
# MICROGEN_USER_URL=https://dev-khayanganjxhpv.microgen.id/api/user
6+
# GENERATE_USER=https://ujirnnbxgq.function.microgen.id/api/v1/generate/user
7+
# USER_BLOCKCHAIN_URL=https://dev-khayanganjxhpv.microgen.id/api/userBlockChains
8+
9+
CLUSTER_API_ENDPOINT=https://10.10.72.89/v3/clusters
10+
BEARER_TOKEN=token-vbqz2:drkj26x7d4f9rwx27cdmsthn5nkpjvdzl8rlhn92pxhdhzd2r9ccqd
11+
NAMESPACE=blockchain
12+
CLUSTER=c-zmj9v
13+
MICROGEN_USER_URL=https://dev-khayanganjxhpv.microgen.id/api/user
14+
GENERATE_USER=https://ujirnnbxgq.function.microgen.id/api/v1/generate/user
15+
USER_BLOCKCHAIN_URL=https://dev-khayanganjxhpv.microgen.id/api/userBlockChains

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# .env
2+
__pycache__

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
FROM python:3.8-slim-buster
2+
3+
WORKDIR /app
4+
COPY requirements.txt requirements.txt
5+
RUN pip3 install -r requirements.txt
6+
COPY . .
7+
8+
ENV FLASK_RUN_HOST=0.0.0.0
9+
10+
CMD ["python3", "-m", "flask", "run"]

README copy.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# khayangan-blockchain

app.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import os
2+
from sys import stderr
3+
4+
from flask import Flask, jsonify, request
5+
from flask_cors import CORS
6+
7+
from yt_scraper_sroll import handler as yt_scraper_sroll_handler
8+
9+
app = Flask(__name__)
10+
CORS(app, resources={r"/*": {"origins": "*"}})
11+
12+
@app.route('/')
13+
def hello_geek():
    """Return the greeting markup served by this view.

    Returns:
        str: a single, well-formed ``<h1>`` heading.
    """
    # The heading used to be closed with ``</h2>``, which is malformed HTML;
    # the closing tag now matches the opening ``<h1>``.
    return '<h1>Hello from Flask</h1>'
15+
16+
@app.route('/youtube_scraper_scroll', methods=['POST'])
17+
def youtube_scraper_scroll():
    """Hand the current Flask request to the scraper handler and relay its result.

    The handler receives both the ``request`` object and the ``jsonify``
    helper, and whatever it returns is passed back to Flask unchanged.
    """
    response = yt_scraper_sroll_handler(request, jsonify)
    return response
19+
20+
if __name__ == "__main__":
21+
app.run(debug=True)

geckodriver.log

Lines changed: 1566 additions & 0 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
certifi==2022.6.15
2+
charset-normalizer==2.1.1
3+
click==8.0.3
4+
Flask==2.0.2
5+
idna==3.3
6+
itsdangerous==2.0.1
7+
Jinja2==3.0.2
8+
MarkupSafe==2.0.1
9+
requests==2.28.1
10+
urllib3==1.26.12
11+
Werkzeug==2.0.2
12+
python-dotenv==0.21.0
13+
flask_cors==3.0.3

results/gudang garam_scroll-10_20230122_231745.txt

Lines changed: 0 additions & 3913 deletions
This file was deleted.

youtube_search_scraper_scroll.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from selenium.webdriver.support.wait import WebDriverWait
55
from selenium.webdriver.support import expected_conditions as EC
66
import urllib.parse
7+
import os
78
import datetime
89

910
# options = webdriver.ChromeOptions()
@@ -36,7 +37,7 @@
3637
video_views = []
3738
video_published_times = []
3839

39-
max_scroll = 10
40+
max_scroll = 2
4041
file_name = f"{query}_scroll-{max_scroll}_{now.strftime('%Y%m%d_%H%M%S')}"
4142
# while True:
4243
while max_scroll > 0:
@@ -46,7 +47,7 @@
4647
for i, video_id in enumerate(video_ids):
4748
video_links.append(video_id.get_attribute("href"))
4849
video_titles.append(video_id.get_attribute("title"))
49-
50+
5051
video_infos = driver.find_elements(
5152
By.XPATH, "//span[@class='inline-metadata-item style-scope ytd-video-meta-block']")
5253
# for view_count in view_counts:
@@ -63,15 +64,17 @@
6364
"return document.documentElement.scrollHeight")
6465
driver.execute_script(
6566
f"window.scrollTo(0, {document_height_before + scroll_height});")
66-
67+
6768
# write to file
6869
with open(f"results/{file_name}.txt", "a") as f:
6970
for i, video_link in enumerate(video_links):
7071
if i < len(video_links) - 1:
71-
f.write(f"{video_link}{video_titles[i]}{video_views[i]}{video_published_times[i]}\n")
72+
f.write(
73+
f"{video_link}{video_titles[i]}{video_views[i]}{video_published_times[i]}\n")
7274
else:
73-
f.write(f"{video_link}{video_titles[i]}{video_views[i]}{video_published_times[i]}")
74-
75+
f.write(
76+
f"{video_link}{video_titles[i]}{video_views[i]}{video_published_times[i]}")
77+
7578
time.sleep(3)
7679
document_height_after = driver.execute_script(
7780
"return document.documentElement.scrollHeight")

yt_scraper_sroll/__init__.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from selenium import webdriver
2+
import time
3+
from selenium.webdriver.common.by import By
4+
from selenium.webdriver.support.wait import WebDriverWait
5+
from selenium.webdriver.support import expected_conditions as EC
6+
import urllib.parse
7+
import datetime
8+
import os
9+
10+
# options = webdriver.ChromeOptions()
11+
# # options.add_argument("start-maximized")
12+
# options.add_experimental_option("excludeSwitches", ["enable-automation"])
13+
# options.add_experimental_option('useAutomationExtension', False)
14+
15+
# Command-line switches handed to the browser, in the order they are applied.
# NOTE(review): apart from --headless these look like Chromium switches;
# confirm that Firefox actually honours them.
_FIREFOX_ARGUMENTS = (
    "--headless",
    "--window-size=1920,1080",
    "--disable-gpu",
    "--disable-extensions",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-features=VizDisplayCompositor",
    "--disable-features=NetworkService",
)

options = webdriver.FirefoxOptions()
for _argument in _FIREFOX_ARGUMENTS:
    options.add_argument(_argument)

# The browser session is started once, when this module is imported.
driver = webdriver.Firefox(options=options)

# Parent of the directory that contains this file.
BASE_DIR = os.path.join(os.path.dirname(__file__), '..')
26+
27+
28+
def _safe_file_stem(query, max_length=100):
    """Reduce *query* to characters that are safe inside a single file name.

    Anything other than letters, digits, space, ``-`` and ``_`` becomes ``_``
    so a request cannot inject path separators or ``..`` into the output path.
    """
    cleaned = "".join(
        ch if ch.isalnum() or ch in " -_" else "_" for ch in query
    )
    return cleaned[:max_length] or "query"


def _snapshot_rows(browser):
    """Return ``(link, title, views, published)`` tuples for the results in the DOM.

    Views and publication times are matched to links by position, which is
    the same pairing the original index-based code relied on.
    NOTE(review): that pairing assumes every result exposes both metadata
    items; a missing value is written as an empty string instead of raising.
    """
    anchors = browser.find_elements(By.XPATH, "//a[@id='video-title']")
    infos = browser.find_elements(
        By.XPATH, "//span[@class='inline-metadata-item style-scope ytd-video-meta-block']")

    views = []
    published = []
    for info in infos:
        text = info.text  # read once: every access is a WebDriver round trip
        if "views" in text:
            views.append(text)
        elif "ago" in text:
            published.append(text)

    rows = []
    for index, anchor in enumerate(anchors):
        link = anchor.get_attribute("href")
        if not link:
            continue
        rows.append((
            link,
            anchor.get_attribute("title") or "",
            views[index] if index < len(views) else "",
            published[index] if index < len(published) else "",
        ))
    return rows


def handler(request, jsonify):
    """Scrape YouTube search results for a query and store them in a text file.

    Args:
        request: object exposing ``get_json()``; the JSON body must provide
            ``query`` (str) and ``scroll`` (integer-like, maximum number of
            scroll steps).
        jsonify: callable that turns a dict into the response payload.

    Returns:
        A ``(payload, status)`` tuple: status 400 with a ``message`` when the
        body is missing or invalid, otherwise status 200 with ``message`` and
        the ``filename`` stem of the file written under ``BASE_DIR/results``.
    """
    body = request.get_json()

    if body is None:
        return jsonify({'message': 'No body provided'}), 400

    try:
        query = body['query']
        scroll = body['scroll']
    except (KeyError, TypeError) as e:
        # KeyError: field missing; TypeError: body is not a mapping.
        return jsonify({'message': str(e) + " not provided"}), 400

    # Reject malformed values up front instead of failing later with a 500.
    if not isinstance(query, str):
        return jsonify({'message': 'query must be a string'}), 400
    try:
        max_scroll = int(scroll)
    except (TypeError, ValueError, OverflowError):
        return jsonify({'message': 'scroll must be an integer'}), 400

    query_url = urllib.parse.quote(query)
    print('Query URL: ', query_url)
    now = datetime.datetime.now()
    file_name = f"{_safe_file_stem(query)}_scroll-{max_scroll}_{now.strftime('%Y%m%d_%H%M%S')}"

    driver.get(f"https://www.youtube.com/results?search_query={query_url}")
    scroll_height = driver.execute_script("return window.innerHeight")

    # Keyed by link: each pass re-reads the whole page, so earlier results
    # show up again and must not be stored twice.
    rows_by_link = {}

    while max_scroll > 0:
        print("Scroll:", max_scroll)
        max_scroll -= 1

        for row in _snapshot_rows(driver):
            rows_by_link[row[0]] = row

        document_height_before = driver.execute_script(
            "return document.documentElement.scrollHeight")
        driver.execute_script(
            f"window.scrollTo(0, {document_height_before + scroll_height});")

        # Give the page time to load more results before measuring again.
        time.sleep(1.5)
        document_height_after = driver.execute_script(
            "return document.documentElement.scrollHeight")
        if document_height_after == document_height_before:
            break  # nothing new was loaded

    # Write every row once, after scrolling, rather than re-appending the
    # accumulated list on each pass.
    results_dir = os.path.join(BASE_DIR, "results")
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, f"{file_name}.txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(" ‽ ".join(row) for row in rows_by_link.values()))

    # The module-level driver is shared by every request, so it is deliberately
    # NOT quit here: quitting it made every request after the first one fail.
    return jsonify({'message': 'success', "filename": file_name}), 200

0 commit comments

Comments
 (0)