From 598baf1921ab1310415bf04b6f938f2d97610ff8 Mon Sep 17 00:00:00 2001
From: yunruxian <68860614+yunruxian@users.noreply.github.com>
Date: Wed, 3 Jan 2024 19:52:42 +0800
Subject: [PATCH 1/5] Update readme.md

---
 readme.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/readme.md b/readme.md
index 16483bc..3f7e189 100644
--- a/readme.md
+++ b/readme.md
@@ -24,3 +24,7 @@ jinyangl
 Code logic: for each task, first simulate entering the query for an advanced search, then download the plain-text file and move it into the path, and finally open every result page and download it into the path as well.
 
 So far this is probably the first crawler for the new version of WoS; I hope it can serve as a starting point for better work. The code logic should be fairly clear. Adapt it to your own needs, and feel free to get in touch (~~more to come~~).
+
+----
+Updated URL:
+https://webofscience.clarivate.cn/wos/alldb/advanced-search

From 189a536841a40a584997eb65be9bbe401215b087 Mon Sep 17 00:00:00 2001
From: Yun <68860614+yunruxian@users.noreply.github.com>
Date: Sun, 14 Jan 2024 14:14:05 +0800
Subject: [PATCH 2/5] Add files via upload

---
 common.py                  | 265 +++++++++++++
 crawl.py                   | 737 ++++++++++++++++++++++++++-----------
 main.py                    | 124 ++++++-
 raw data file/example.xlsx | Bin 0 -> 9001 bytes
 readme.md                  |  40 +-
 5 files changed, 908 insertions(+), 258 deletions(-)
 create mode 100644 common.py
 create mode 100644 raw data file/example.xlsx

diff --git a/common.py b/common.py
new file mode 100644
index 0000000..f21a46d
--- /dev/null
+++ b/common.py
@@ -0,0 +1,265 @@
+import time
+import os
+import sys
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import wait, expected_conditions
+
+
+def save_screenshot(driver, prefix, pic_path):
+    """Take a screenshot and save it as a png. (Currently not called anywhere.)"""
+    # File name: prefix (e.g. a paper id) + current time
+    current_time = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
+    driver.save_screenshot(f'{pic_path}{str(prefix)}_{current_time}.png')
+
+
+# Task bookkeeping
+class Check:
+    """
+    Marks and checks task progress through flag files:
+    1. whether all work for a task is finished;
+    2. whether the file exported by the WoS system has been downloaded;
+    3. whether a result subpage of a search has been fully processed;
+    4. subpage: whether the subpage has been downloaded;
+    5. subpage: whether the wanted information has been extracted from the subpage.
+    """
+    # Marks whether a task is finished
+    @staticmethod
+    def mark_task_finish_flag(path_school, query_task):
+        """Create a flag file in the path to mark the task as completed."""
+        with open(os.path.join(path_school, query_task, 'completed.flag'), 'w', encoding='utf-8') as f:
+            f.write('1')
+        return True
+
+    @staticmethod
+    def check_task_finish_flag(path_school, query_task):
+        """Check the flag file to see whether the task has already been searched."""
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and 'completed.flag' in os.listdir(str(task_path))
+
+    # Check whether record_sys.txt has already been downloaded
+    @staticmethod
+    def check_task_downloaded_file(path_school, query_task):
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and 'record_sys.txt' in os.listdir(str(task_path))
+
+    # Marks whether a subpage task is finished
+    @staticmethod
+    def mark_subpage_done(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        with open(
+                os.path.join(path_school, query_task, f'{subpage_name}.flag'),
+                'a+', encoding='utf-8') as file:
+            file.write(subpage_name)
+            file.write('\n')
+
+    @staticmethod
+    def check_subpage_done(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and f'{subpage_name}.flag' in os.listdir(str(task_path))
+
+    # Download status of each search result (each record under a task)
+    @staticmethod
+    def check_subpage_downloaded(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        file_path = os.path.join(path_school, query_task)
+        if (
+                os.path.exists(file_path) and f'{subpage_name}.html' in os.listdir(str(file_path))
+        ) and (
+                os.path.exists(file_path) and f'{subpage_name}.dat' in os.listdir(str(file_path))
+        ):
+            return True
+        else:
+            return False
+
+    # Extraction status of each search result (each record under a task)
+    @staticmethod
+    def mark_get_subpage_selected_info(path_school, query_task, sub_page_url):
+        with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'a+', encoding='utf-8') as file:
+            file.write(sub_page_url)
+            file.write('\n')
+
+    @staticmethod
+    def check_get_subpage_selected_info(path_school, query_task, sub_page_url):
+        record_urls = set()
+        try:
+            with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'r', encoding='utf-8') as file:
+                for record_url in file.readlines():
+                    record_url = record_url.strip("\n")
+                    record_urls.add(record_url)
+            if sub_page_url in record_urls:
+                return True
+            else:
+                return False
+        except FileNotFoundError:
+            # This is the first page handled, so 'search_result_record.txt' does not exist yet
+            return False
+
+    @staticmethod
+    def check_total_handled(path_school, query_task, n_record):
+        record_urls = set()
+        with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'r', encoding='utf-8') as file:
+            for record_url in file.readlines():
+                record_url = record_url.strip("\n")
+                record_urls.add(record_url)
+        if len(record_urls) == n_record:
+            return True
+        else:
+            return False
+
+
+def roll_down(driver, fold=40):
+    """
+    Roll down to the bottom of the page to load all results.
+    """
+    # fold = number of scroll steps: move down 500 px per step, 40 times by default
+    for i_roll in range(1, fold + 1):
+        time.sleep(0.1)
+        driver.execute_script(f'window.scrollTo(0, {i_roll * 500});')
+
+
+def show_more(driver):
+    """
+    Show information hidden behind "show more" buttons.
+    """
+    # Show all authors et al.
+    element = wait.WebDriverWait(driver, 20).until(
+        expected_conditions.element_to_be_clickable(
+            (
+                By.XPATH,
+                '//button[@id="HiddenSecTa-showMoreDataButton"]'
+            )))
+    driver.execute_script("arguments[0].click();", element)
+    # On some pages the button can be located directly, on others it never can,
+    # hence the JS click above instead of:
+    # driver.find_element(
+    #     By.XPATH,
+    #     '/button[@id="HiddenSecTa-showMoreDataButton"]'
+    # ).click()
+    try:
+        driver.find_element(By.XPATH, '//*[text()="...More"]').click()
+    except:
+        pass
+    try:
+        driver.find_element(By.XPATH, '//*[text()=" ...more addresses"]').click()
+    except:
+        pass
+
+
+def check_if_human(driver):
+    """
+    Check whether a robot/captcha check has appeared.
+    """
+    # Catch NoSuchElementException explicitly: with a bare except, the Exception
+    # raised when the check IS present would be swallowed by the handler itself.
+    try:
+        driver.find_element(
+            By.XPATH,
+            '//*[contains(text(),"Please verify you are human to proceed.")]'
+        )
+    except NoSuchElementException:
+        try:
+            driver.find_element(
+                By.XPATH,
+                '//*[contains(text(),"我是人类")]'
+            )
+        except NoSuchElementException:
+            return
+    raise Exception("Robot check detected")
+
+
+def get_file_name(current_url):
+    subpage_id = current_url.replace("https://webofscience.clarivate.cn/wos/alldb/full-record/", "")
+    pre, suf = subpage_id.split(":")
+    return pre + "_" + suf
+
+
+def decorator(func):
+    """
+    Decorator: if the wrapped call raises, return "" instead of propagating.
+    """
+    def wrapper(*args, **kwargs):
+        try:
+            result = func(*args, **kwargs)
+        except:
+            result = ""
+        return result
+
+    return wrapper
+
+
+@decorator
+def get_element_text(driver, match_condition):
+    """
+    Return an element's text, or "" when it cannot be found (see the decorator above).
+    """
+    info = driver.find_element(
+        By.XPATH, match_condition
+    ).text
+    return info
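+
+
+# Example (illustration): thanks to the decorator, optional fields read as ""
+# instead of raising, so no try/except is needed at the call site. The XPath
+# below is one actually used in crawl.py:
+#     title_zh = get_element_text(driver, '//*[@id="FullRTa-fullRecordtitle-0" and @lang="zh-cn"]')
+#     if not title_zh:
+#         pass  # this record simply has no Chinese title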
+
+
+def close_pendo_windows(driver):
+    """Close guiding windows."""
+    # Cookies
+    try:
+        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
+    except:
+        pass
+    # "Remind me later"
+    try:
+        driver.find_element(By.XPATH, '//*[@id="pendo-close-guide-5600f670"]').click()
+    except:
+        pass
+    # "Step 1 of 2"
+    try:
+        driver.find_element(By.XPATH, '//*[@id="pendo-close-guide-30f847dd"]').click()
+    except:
+        pass
+    # "Got it"
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click()
+    except:
+        pass
+    # "No thanks"
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click()
+    except:
+        pass
+    # What was it... I forgot...
+    try:
+        driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")]').click()
+    except:
+        pass
+    # Overlay
+    try:
+        driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click()
+    except:
+        pass
+    # Overlay dialog
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click()
+    except:
+        pass
+    # "Allow all" ("全部允许" on the Chinese cookie banner)
+    try:
+        driver.find_element(By.XPATH, '//button[contains(text(), "全部允许")]').click()
+    except:
+        pass
+
+
+# Print progress to the console
+def show_progress(total, current):
+    percent = 100 * current / total
+    progress = "█" * int(percent / 10)
+    sys.stdout.write(f"\r{percent:3.0f}%|{progress:<10}| {current}/{total}")
+    sys.stdout.flush()
+
+
+# Walk a folder and list every file inside it
+def list_all_files(path):
+    all_file_list = list()
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            print(file_path)
+            all_file_list.append(file_path)
+    return all_file_list
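+
+
+if __name__ == "__main__":
+    # Quick smoke test (illustration only) of the helpers that need no browser.
+    # The record URL is a made-up example of the WoS full-record URL format.
+    demo_url = "https://webofscience.clarivate.cn/wos/alldb/full-record/WOS:000123456700001"
+    print(get_file_name(demo_url))  # -> WOS_000123456700001
+    show_progress(total=10, current=3)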

diff --git a/crawl.py b/crawl.py
index c818fe3..640e9de 100644
--- a/crawl.py
+++ b/crawl.py
@@ -1,18 +1,82 @@
 # encoding: utf-8
-from logging import handlers
 import pathlib
 import shutil
 import time
 import logging
 import os
 import re
-import tqdm
+import json
+import pandas as pd
+from selenium.common import NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import wait, expected_conditions
+import common
+from selenium import webdriver
+
+check = common.Check
+
+
+# Entry point for one crawl task
+def start(school, query_task, default_download_path, others_information):
+
+    # Configure browser options
+    options = webdriver.ChromeOptions()
+    options.add_argument("--disable-infobars")
+    options.add_argument("--window-size=1920,1080")
+    options.add_experimental_option('useAutomationExtension', False)
+    options.add_experimental_option('excludeSwitches', ['enable-automation'])
+    # Hide the "browser is being controlled by selenium" banner
+    options.add_argument("--disable-blink-features=AutomationControlled")
+
+    # Make sure the download path ends with the expected file name
+    if not default_download_path.endswith("/savedrecs.txt"):
+        default_download_path += "/savedrecs.txt"
+
+    # driver.get("https://webofscience.clarivate.cn/")
+    # wait_for_login(driver)
+    # switch_language_to_eng(driver)
+
+    path_school = os.path.join("downloads", school)
+
+    # Skip the task if its path exists and it has already been processed
+    if path_school is not None and check.check_task_finish_flag(path_school, query_task):
+        return False
+
+    # Instantiate the driver and run
+    driver = webdriver.Chrome(options=options)
+    # Maximize the window
+    driver.maximize_window()
+
+    # Search query
+    if not search_query(driver, path_school, query_task):
+        # Stop if the search failed for some reason
+        return False
+
+    # Download-from-page part.
+    # The block below checks whether the export file offered by WoS has already
+    # been downloaded for this task. It cannot be used with multiple threads:
+    # a thread cannot guarantee that it picks up the file it downloaded itself,
+    # and because threads run fast the file names keep shifting.
+    # if not check.check_task_downloaded_file(path_school, query_task):
+    #     # Download the search results using inner website function
+    #     if not download_search_results(driver, default_download_path):
+    #         return False
+    #     # Deal with the downloaded search results file(.txt)
+    #     if not deal_with_downloaded_file(driver, default_download_path, path_school, query_task):
+    #         return False
+
+    # Open every record page, download it and scrape the information
+    # Deal with records
+    if not deal_with_records(driver, path_school, query_task, others_information):
+        return False
+
+    # Search completed
+    if path_school is not None:
+        check.mark_task_finish_flag(path_school, query_task)
+    driver.quit()
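+
+# Example call (a sketch; the school, query and download path are made up, and
+# the dict keys match the Excel columns read in main.py):
+#     start("SchoolA",
+#           "AU='Smith, J' AND AD='Some University'",
+#           "C:/Users/me/Downloads",
+#           {"学校ID": 1, "教师ID": 1, "姓名": "Smith"})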
 
 
 def wait_for_login(driver):
-    '''Wait for the user to login if wos cannot be accessed directly.'''
+    """Wait for the user to login if wos cannot be accessed directly."""
     try:
         driver.find_element(By.XPATH, '//div[contains(@class, "shibboleth-login-form")]')
         input('Login before going next...\n')
@@ -20,99 +84,66 @@
         pass
 
 
-def switch_language_to_Eng(driver):
-    '''Switch language from zh-cn to English.'''
+def switch_language_to_eng(driver):
+    """Switch language from zh-cn to English."""
     wait.WebDriverWait(driver, 10).until(
         expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, "search-main-box")]')))
-    close_pendo_windows(driver)
+    # Close some prompt windows first
+    common.close_pendo_windows(driver)
+
     try:
         driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
         driver.find_element(By.XPATH, '//button[@lang="en"]').click()
     except:
-        close_pendo_windows(driver)
+        common.close_pendo_windows(driver)
         driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
         driver.find_element(By.XPATH, '//button[@lang="en"]').click()
 
 
-def close_pendo_windows(driver):
-    '''Close guiding windows'''
-    # Cookies
-    try:
-        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
-    except:
-        pass
-    # "Got it"
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click()
-    except:
-        pass
-    # "No thanks"
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click()
-    except:
-        pass
-    # What was it... I forgot...
-    try:
-        driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")').click()
-    except:
-        pass
-    # Overlay
-    try:
-        driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click()
-    except:
-        pass
-    # Overlay dialog
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click()
-    except:
-        pass
-
-
-def mark_flag(path):
-    '''Create a flag in the path to mark the task as completed.'''
-    with open(os.path.join(path, 'completed.flag'), 'w') as f:
-        f.write('1')
-
-
-def check_flag(path):
-    '''Check if the flag in the path to check if task has been searched.'''
-    return os.path.exists(path) and 'completed.flag' in os.listdir(path)
-
-
-def search_query(driver, path, query):
-    '''Go to advanced search page, insert query into search frame and search the query.'''
-    if not path == None:
+def search_query(driver, path, query_task):
+    """
+    Go to the advanced search page, insert the query into the search frame and run it.
+    """
+    # If path is a real value, create the folders now; otherwise the later
+    # open()/with calls would fail
+    if path is not None:
         os.makedirs(path, exist_ok=True)
-        logging.info(path)
+        logging.info(f"Folder {path} created")
+        second_path = os.path.join(path, f'{query_task}')
+        os.makedirs(second_path, exist_ok=True)
+        logging.info(f"Folder {path}/{query_task} created")
 
     # Close extra windows
     if not len(driver.window_handles) == 1:
         handles = driver.window_handles
-        for i_handle in range(len(handles)-1, 0, -1): # traverse in reverse order
+        for i_handle in range(len(handles) - 1, 0, -1):  # traverse in reverse order
             # Switch to the window and load the page
             driver.switch_to.window(handles[i_handle])
             driver.close()
         driver.switch_to.window(handles[0])
 
-    ## Search query
-    driver.get("https://www.webofscience.com/wos/alldb/advanced-search")
+    # Search query
+    driver.get("https://webofscience.clarivate.cn/wos/alldb/advanced-search")
     max_retry = 3
     retry_times = 0
     while True:
         try:
-            close_pendo_windows(driver)
+            common.close_pendo_windows(driver)
             # Load the page
             wait.WebDriverWait(driver, 10).until(
-                expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]')))
+                expected_conditions.presence_of_element_located(
+                    (By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]')
+                )
+            )
             # Clear the field
             driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]').click()
             # Insert the query
-            driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query))
+            driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query_task))
             # Click on the search button
-            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click()
+            driver.find_element(By.XPATH,
+                                '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click()
             break
         except:
             retry_times += 1
@@ -122,18 +153,21 @@
             else:
                 # Retry
                 logging.debug("Search retrying")
+
     # Wait for the query page
     try:
-        wait.WebDriverWait(driver, 5).until(
+        # Judge whether the page loaded by looking for the article links
+        wait.WebDriverWait(driver, 10).until(
             expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))
     except:
         try:
             # No results
-            driver.find_element(By.XPATH, '//*[text()="Your search found no results"]')
-            logging.warning(f'Your search found no results')
+            driver.find_element(By.XPATH, '//*[text()="No records were found to match your filters"]')
+            logging.warning('No records were found to match your filters')
             # Mark as completed
-            if not path == None:
-                mark_flag(path)
+            # Mark the task even when the search returns nothing
+            if path is not None:
+                check.mark_task_finish_flag(path, query_task)
             return False
         except:
             # Search failed
@@ -144,231 +178,512 @@
     return True
 
 
-def download_outbound(driver, default_download_path):
-    '''Export the search results as outbound. The file is downloaded to default path set for the system.'''
+def download_search_results(driver, default_download_path):
+    """
+    Export the search results using the site's own export function.
+    The file lands in the system's default download path.
+    """
     max_retry = 3
     retry_times = 0
     while True:
-        close_pendo_windows(driver)
+        time.sleep(0)
+        common.close_pendo_windows(driver)
         # Not support search for more than 1000 results yet
-        assert int(driver.find_element(By.XPATH, '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!"
+        assert int(driver.find_element(By.XPATH,
                                       '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!"
         # File should not exist on default download folder
-        assert not os.path.exists(default_download_path), "File existed on default download folder!"
+        # assert not os.path.exists(default_download_path), "File existed on default download folder!"
         try:
+            common.close_pendo_windows(driver)
             # Click on "Export"
-            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click()
+            driver.find_element(By.XPATH,
+                                '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click()
             # Click on "Plain text file"
             try:
-                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]').click()
+                driver.find_element(By.XPATH,
+                                    '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]'
+                                    ).click()
             except:
-                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]').click()
+                driver.find_element(By.XPATH,
+                                    '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]'
+                                    ).click()
             # Click on "Records from:"
             driver.find_element(By.XPATH, '//*[text()[contains(string(), "Records from:")]]').click()
             # Click on "Export"
             driver.find_element(By.XPATH, '//span[contains(@class, "ng-star-inserted") and text()="Export"]').click()
             # Wait for download to complete
             for retry_download in range(4):
-                time.sleep(2)
+                time.sleep(0.5)
                 try:
                     # If there is any "Internal error"
                     wait.WebDriverWait(driver, 2).until(
-                        expected_conditions.presence_of_element_located((By.XPATH, '//div[text()="Server encountered an internal error"]')))
+                        expected_conditions.presence_of_element_located(
+                            (By.XPATH, '//div[text()="Server encountered an internal error"]')))
                     driver.find_element(By.XPATH, '//div[text()="Server encountered an internal error"]')
-                    driver.find_element(By.XPATH, '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click()
+                    driver.find_element(By.XPATH,
+                                        '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click()
                 except:
+                    # The download succeeded, leave the loop
                     if os.path.exists(default_download_path):
                         break
             # Download completed
             assert os.path.exists(default_download_path), "File not found!"
             return True
         except:
             retry_times += 1
             if retry_times > max_retry:
                 logging.error("Crawl outbound exceeded max retries")
                 return False
             else:
                 # Retry
                 logging.debug("Crawl outbound retrying")
-                close_pendo_windows(driver)
+                common.close_pendo_windows(driver)
                 # Click on "Cancel"
                 try:
-                    driver.find_element(By.XPATH, '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click()
+                    driver.find_element(By.XPATH,
+                                        '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click()
                 except:
                     driver.refresh()
-                    time.sleep(1)
+                    time.sleep(0)
                     wait.WebDriverWait(driver, 10).until(
                         expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))
                 continue
 
 
-def process_outbound(driver, default_download_path, dst_path):
-    '''Process the outbound downloaded to the default path set for the system.'''
-
+def deal_with_downloaded_file(driver, default_download_path, dst_path, query_task):
+    """
+    Move the downloaded results into the current task path and check that the
+    number of records in the file matches the number of search results.
+    """
     # Move the outbound to dest folder
     assert os.path.exists(default_download_path), "File not found!"
+    # Is the destination a directory or a file?
     if pathlib.Path(dst_path).is_dir():
-        dst_path = os.path.join(dst_path, 'record.txt')
+        dst_path = os.path.join(dst_path, query_task, 'record_sys.txt')
+    # Move the file into place
     shutil.move(default_download_path, dst_path)
     logging.debug(f'Outbound saved in {dst_path}')
 
-    # Load the downloaded outbound (for debug)
-    with open(dst_path, "r", encoding='utf-8') as f_outbound:
-        n_record_ref = len(re.findall("\nER\n", f_outbound.read()))
-    assert n_record_ref == int("".join(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text.split(","))), "Records num do not match outbound num"
+    # Load the downloaded file (for debug)
+    # Check that the record count in the file equals the count shown on the page
+    with open(dst_path, "r", encoding='utf-8') as f_file:
+        n_record_ref = len(re.findall("\nER\n", f_file.read()))
+    assert n_record_ref == int("".join(
+        driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text.split(
+            ","))), "Records num do not match outbound num"
    return True
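+
+# Note: a WoS plain-text export terminates every record with an "ER" line, so
+# the record count can also be re-checked offline with the same pattern, e.g.:
+#     with open("record_sys.txt", encoding="utf-8") as f:
+#         n_records = len(re.findall("\nER\n", f.read()))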
 
 
-def download_record(driver, path, records_id):
-    '''Download the page to the path'''
-    # Load the page or throw exception
-    wait.WebDriverWait(driver, 10).until(
-        expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]')))
-
-    # Download the record
-    with open(os.path.join(path, f'record-{records_id}.html'), 'w', encoding='utf-8') as file:
-        file.write(driver.page_source)
-    logging.debug(f'record #{records_id} saved in {path}')
-
-
-def process_record(driver, path, records_id):
-    '''Parse a page to get certain statistics'''
-    # Show all authors and save raw data
-    try:
-        driver.find_element(By.XPATH, '//*[text()="...More"]').click()
-    except:
-        pass
-    with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file:
-        file.write(driver.page_source)
-    logging.debug(f'record #{records_id} saved in {path}')
-
-
-def roll_down(driver, fold = 40):
-    '''Roll down to the bottom of the page to load all results'''
-    for i_roll in range(1, fold+1):
-        time.sleep(0.1)
-        driver.execute_script(f'window.scrollTo(0, {i_roll * 500});')
-
-
-def save_screenshot(driver, prefix, pic_path):
-    """Screenshot and save as a png"""
-
-    # paper_id + current_time
-    current_time = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
-    driver.save_screenshot(f'{pic_path}{str(prefix)}_{current_time}.png')
-
-
-def process_windows(driver, path, records_id):
-    '''Process all subpages'''
-    handles = driver.window_handles
-    has_error = False
-    for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order
-        # Switch to the window and load the page
-        driver.switch_to.window(handles[i_handle])
-        close_pendo_windows(driver)
-        try:
-            download_record(driver, path, records_id)
-            process_record(driver, path, records_id)
-        except:
-            logging.error("Record downloading failed!")
-            has_error = True
-        records_id += 1
-        driver.close()
-    driver.switch_to.window(handles[0])
-    return len(handles) - 1 if not has_error else -1
-
-
-def process_records(driver, path):
-    '''Open records as new subpages, download or parse subpages according to the setting.'''
+def deal_with_records(driver, path, query_task, others_information):
+    """
+    Open each search result (record) as a new subpage, then download and parse
+    the subpages according to the settings.
+
+    Subpages are handled through process_windows(), which only works on windows
+    that are already open, so it has to be called repeatedly. For every subpage:
+      - download the subpage;
+      - extract the wanted information (get_subpage_inf_wanted).
+    """
     # init
+    # Count how many subpages need to be opened
     n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text)
+    # 50 records per results page, rounded up (ceiling division)
     n_page = (n_record + 50 - 1) // 50
     assert n_page < 2000, "Too many pages"
     logging.debug(f'{n_record} records found, divided into {n_page} pages')
 
+    # Counter for the search results
     records_id = 0
+    # Set of record URLs already seen
     url_set = set()
+
     for i_page in range(n_page):
+        # Exactly one browser window may be open at this point, otherwise error
         assert len(driver.window_handles) == 1, "Unexpected windows"
-        roll_down(driver)
-
+
+        # Scroll to the bottom first so every summary-record-title-link is rendered
+        common.roll_down(driver)
+
         # Open every record in a new window
         windows_count = 0
         for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, "summary-record-title-link")]'):
+            # Has this record already been opened?
            if record.get_attribute("href") in url_set:
                 # coz some records have more than 1 href link
                 continue
             else:
                 url_set.add(record.get_attribute("href"))
-            time.sleep(0.5)
+            # A second check with the same purpose:
+            # has this subpage been finished in an earlier run?
+            current_url = record.get_attribute("href")
+            if check.check_subpage_done(path, query_task, current_url):
+                continue
+            # Open one page
             driver.execute_script(f'window.open(\"{record.get_attribute("href")}\");')
+            time.sleep(1)
             windows_count += 1
+            # Two conditions: at least 10 windows open, and a multiple of 5.
+            # That is, batches are processed only at 10, 15, 20, ...
+            # If multithreading is wanted later, this seems the only workable way
             if windows_count >= 10 and not windows_count % 5:
                 # Save records and close windows
-                increment = process_windows(driver, path, records_id)
+                # The return value is the number of pages processed
+                increment = process_windows(driver, path, query_task, others_information)
                 if increment != -1:
                     records_id += increment
                 else:
                     return False
-            time.sleep(5)
-
+            time.sleep(0)
+        # The last, partial batch, e.g. the remaining 3 of 53 results
         # Save records and close windows
-        increment = process_windows(driver, path, records_id)
+        increment = process_windows(driver, path, query_task, others_information)
         if increment != -1:
             records_id += increment
         else:
             return False
 
         # Go to the next page
-        if i_page + 1 < n_page:
-            driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, "arrowRight")]').click()
-    return True
+        if i_page + 1 < n_page:
+            element = wait.WebDriverWait(driver, 20).until(
+                expected_conditions.element_to_be_clickable(
+                    (
+                        By.XPATH,
+                        '//button[contains(@data-ta, "next-page-button")]'
+                    )))
+            driver.execute_script("arguments[0].click();", element)
+    # The result depends on whether the number of handled records matches the query count
+    if check.check_total_handled(path, query_task, n_record):
+        return True
+    else:
+        logging.error("Number of handled records does not equal the number searched!")
+        return False
+
+
+def process_windows(driver, path, query_task, others_information):
+    """
+    Process all open subpages.
+    path: path of the task
+    """
+    handles = driver.window_handles
+    has_error = False
+    for i_handle in range(len(driver.window_handles) - 1, 0, -1):  # traverse in reverse order
+        # Switch to the window and load the page (last window first)
+        driver.switch_to.window(handles[i_handle])
+        common.close_pendo_windows(driver)
+        current_url = driver.current_url
+        time.sleep(1)
+        # Expand the hidden parts of the page
+        common.show_more(driver)
+        # Download the page
+        if not check.check_subpage_downloaded(path, query_task, current_url):
+            if not download_subpage(driver, path, query_task):
+                logging.error(f"Page({current_url}) download mistake!")
+                has_error = True
+                driver.close()
+                continue
+        # Extract the wanted information
+        if not check.check_get_subpage_selected_info(path, query_task, current_url):
+            if not get_subpage_inf_wanted(driver, path, query_task, others_information):
+                logging.error(f"Page({current_url}) information get mistake!")
+                has_error = True
+                driver.close()
+                continue
+        driver.close()
+        # Mark this subpage as finished
+        check.mark_subpage_done(path, query_task, current_url)
+
+    driver.switch_to.window(handles[0])
+    # On success return len(handles) - 1, the number of windows processed; otherwise -1
+    return len(handles) - 1 if not has_error else -1
+
+
+def download_subpage(driver, path, query_task):
+    """
+    Download the page to the task path.
+    """
+    try:
+        # Load the page or throw an exception
+        wait.WebDriverWait(driver, 10).until(
+            expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]')))
+
+        current_url = driver.current_url
+        subpage_name = common.get_file_name(current_url)
+
+        # Download the record
+        with open(os.path.join(path, query_task, f"{subpage_name}.html"), 'w', encoding='utf-8') as file:
+            file.write(driver.page_source)
+        logging.debug(f'record # {subpage_name} saved in {path}/{query_task}')
+
+        # Save the record again as .dat; the content duplicates the .html file,
+        # but both files are required by check_subpage_downloaded
+        with open(os.path.join(path, query_task, f"{subpage_name}.dat"), 'w', encoding='utf-8') as file:
+            file.write(driver.page_source)
+        logging.debug(f'record # {subpage_name} saved in {path}/{query_task}')
+        return True
+    except:
+        return False
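+
+# The saved .html snapshots can also be parsed offline later. A sketch, assuming
+# beautifulsoup4 is installed (it is not a dependency of this project):
+#     from bs4 import BeautifulSoup
+#     with open("WOS_000123456700001.html", encoding="utf-8") as f:
+#         soup = BeautifulSoup(f, "html.parser")
+#     title = soup.select_one("#FullRTa-fullRecordtitle-0")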
+
+
+def get_subpage_inf_wanted(driver, path, query_task, others_information):
+    """
+    Scrape the following information: paper title, journal, publication date,
+    authors, affiliations, document type, and more.
+    """
+    try:
+        # Page URL
+        current_url = driver.current_url
+        # Paper title
+        title_en = driver.find_element(
+            By.XPATH, '//*[@id="FullRTa-fullRecordtitle-0" and @lang="en"]'
+        ).text.capitalize()
+        title_zh_cn = common.get_element_text(
+            driver, '//*[@id="FullRTa-fullRecordtitle-0" and @lang="zh-cn"]'
+        )
+        # Journal name: the no-link variant is a span, the linked variant an a
+        try:
+            try:
+                journal_en = driver.find_element(
+                    By.XPATH,
+                    '//span[contains(@class, "summary-source-title") and @lang="en"]'
+                ).text.capitalize()
+                journal_zh_cn = common.get_element_text(
+                    driver,
+                    '//span[contains(@class, "summary-source-title") and @lang="zh-cn"]'
+                )
+            except:
+                # Journal name with a link
+                journal_en = driver.find_element(
+                    By.XPATH,
+                    '//a[contains(@class, "summary-source-title-link") and @lang="en"]'
+                ).text.capitalize()
+                journal_zh_cn = common.get_element_text(
+                    driver,
+                    '//a[contains(@class, "summary-source-title-link") and @lang="zh-cn"]'
+                )
+        except:
+            journal_en = "No journal name on this record; check the Document Type"
+            journal_zh_cn = ""
+        # Author information, English
+        authors_info_en = get_authors_info(driver, language="en")
+        # Author information, Chinese
+        authors_info_zh_cn = get_authors_info(driver, language="zh_cn")
+        # Publication date
+        published_date = common.get_element_text(driver, '//span[@name="pubdate"]')
+        # Indexed date
+        indexed_date = common.get_element_text(driver, '//span[@name="indexedDate"]')
+        # Document type
+        document_type = common.get_element_text(driver, '//span[@id="FullRTa-doctype-0"]')
+        # Volume
+        volume = common.get_element_text(driver, '//span[@id="FullRTa-volume"]')
+        # Issue
+        issue = common.get_element_text(driver, '//span[@id="FullRTa-issue"]')
+        # Page numbers
+        pagenum = common.get_element_text(driver, '//span[@id="FullRTa-pageNo"]')
+        # DOI
+        doi = common.get_element_text(driver, '//span[@id="FullRTa-DOI"]')
+        # Abstract, English
+        abstract_en = common.get_element_text(
+            driver, '//div[@id="FullRTa-abstract-basic" and @lang = "en"]/p')
+        # Abstract, Chinese
+        abstract_zh_cn = common.get_element_text(
+            driver, '//div[@id="FullRTa-abstract-basic" and @lang = "zh-cn"]/p')
+        # Language
+        language = common.get_element_text(driver, '//span[@id="HiddenSecTa-language-0"]')
+        # Cited count
+        try:
+            cited_num = driver.find_element(
+                By.XPATH, '//*[contains(@id, "FullRRPTa-wos-citation-network-times-cited-count-link")]'
+            ).text
+        except:
+            cited_num = 0
+        # Corresponding authors
+        corresponding_author_set = set()
+        for i in range(5):
+            try:
+                corresponding_author_elements = wait.WebDriverWait(driver, 10).until(
+                    expected_conditions.presence_of_all_elements_located(
+                        (By.XPATH, f'//div[@id="FRAiinTa-RepAddrTitle-{i}"]/div/div')
+                    )
+                )
+                for corresponding_author_element in corresponding_author_elements:
+                    corresponding_author = corresponding_author_element.find_element(
+                        By.XPATH,
+                        './/span[@class="value"]'
+                    ).text
+                    corresponding_author_set.add(corresponding_author)
+            except:
+                try:
+                    corresponding_author = wait.WebDriverWait(driver, 10).until(
+                        expected_conditions.presence_of_element_located(
+                            (By.XPATH, f'//div[@id="FRAiinTa-RepAddrTitle-{i}"]/div/div/span[@class="value"]')
+                        )
+                    ).text
+                    corresponding_author_set.add(corresponding_author)
+                except:
+                    break
+        corresponding_author_list = list(corresponding_author_set)
+
+        subpage_inf = dict(
+            school_id=str(others_information["学校ID"]),
+            school=path,
+            teacher_id=str(others_information["教师ID"]),
+            teacher_name=others_information["姓名"],
+            current_url=current_url,
+            title=dict(title_en=title_en, title_zh_cn=title_zh_cn),
+            journal=dict(journal_en=journal_en, journal_zh_cn=journal_zh_cn),
+            authors_info=dict(authors_info_en=authors_info_en, authors_info_zh_cn=authors_info_zh_cn),
+            corresponding_author_list=corresponding_author_list,
+            cited_num=cited_num,
+            published_date=published_date,
+            indexed_date=indexed_date,
+            volume=volume,
+            issue=issue,
+            pagenum=pagenum,
+            doi=doi,
+            document_type=document_type,
+            abstract=dict(abstract_en=abstract_en, abstract_zh_cn=abstract_zh_cn),
+            language=language,
+            time_data=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
+        )
+
+        # Write a pretty-printed copy (txt); the with block closes the file itself
+        with open(os.path.join(path, query_task, 'search_results_information_got.txt'), 'a', encoding='utf-8') as file:
+            json.dump(subpage_inf, file, indent=4, ensure_ascii=False, allow_nan=True)
+            file.write("\n")
+            logging.debug(f'record #{current_url} dict saved in {path}-{query_task}')
+
+        # Write one JSON object per line (json)
+        with open(os.path.join(path, query_task, 'search_results_information_got.json'), 'a', encoding='utf-8') as file:
+            json.dump(subpage_inf, file, ensure_ascii=False, allow_nan=True)
+            file.write("\n")
+            logging.debug(f'record #{current_url} dict saved in {path}-{query_task}')
+
+        # Mark this page as scraped
+        check.mark_get_subpage_selected_info(path, query_task, current_url)
+        return True
+    except:
+        return False
+
+
+def get_authors_info(driver, language="en"):
+    """
+    Collecting the author information is long and repetitive, so it gets its own function.
+    """
+    authors_info = list()
+
+    author_elements = driver.find_elements(
+        By.XPATH,
+        '//div[@id="SumAuthTa-MainDiv-author-{lang}"]/span/span[@class="value ng-star-inserted"]'.format(lang=language)
+    )
+    # find_elements does not raise when nothing matches; it returns an empty list.
+    # In that case return a blank entry; this happens e.g. for all-English records
+    # that have no Chinese author block.
+    if not author_elements:
+        author_info = dict(
+            author_order="",
+            author_name_dis="",
+            author_name_std="",
+            author_addresses=[],
+            author_email="",
+        )
+        authors_info.append(author_info)
+        return authors_info
+    # Otherwise walk through the authors
+    author_order = 0
+    for author_element in author_elements:
+        # Display name
+        author_name_dis = author_element.find_element(
+            By.XPATH,
+            './/a[@id="SumAuthTa-DisplayName-author-{lang}-{order}"]'.format(order=author_order, lang=language)
+        ).text
+        # Standard name
+        author_name_std = common.get_element_text(
+            author_element,
+            './/span[@id="SumAuthTa-FrAuthStandard-author-{lang}-{order}"]/span'.format(
+                order=author_order, lang=language
+            )
+        )
+        author_addresses = list()
+
+        # Addresses attached to this author
+        try:
+            address_elements = author_element.find_elements(
+                By.XPATH,
+                './/a[contains(@class,"address_link")]'
+            )
+        except NoSuchElementException:
+            # Fall back to the author's own index
+            address_elements = [author_order]
+
+        for address_element in address_elements:
+            # Address links read like "[1]"; pull the number out.
+            # The integer fallback above carries the index directly.
+            if isinstance(address_element, int):
+                address_order = str(address_element)
+            else:
+                address_order = address_element.text.strip()
+                address_order = str(address_order).split("[")[1].split("]")[0]
+            # Resolve the number to the matching address
+            try:
+                address = driver.find_element(
+                    By.XPATH,
+                    '//*[@id="address_{}"]/span[2]'.format(address_order)
+                ).text
+                author_addresses.append(address)
+            except:
+                author_addresses.append("No address listed on this record!")
+        # Email
+        try:
+            author_email = driver.find_element(
+                By.XPATH,
+                '//a[@id="FRAiinTa-AuthRepEmailAddr-{}"]'.format(author_order)
+            ).text
+        except:
+            author_email = "Note: author emails on this record do not map one-to-one to authors!"
+        author_order += 1
+        author_info = dict(
+            author_order=author_order,
+            author_name_dis=author_name_dis,
+            author_name_std=author_name_std,
+            author_addresses=author_addresses,
+            author_email=author_email,
+        )
+        authors_info.append(author_info)
+    return authors_info
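+
+# Shape of one authors_info entry (illustrative values only):
+#     {"author_order": 1,
+#      "author_name_dis": "Smith, John",
+#      "author_name_std": "Smith, J.",
+#      "author_addresses": ["Dept. of X, Some University, City, Country"],
+#      "author_email": "js@example.com"}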
+
+
+def json_to_excel(file_path):
+    """
+    Convert the downloaded JSON-lines data into an Excel file.
+    """
+    # Read the JSON data and parse it into Python structures, one object per line
+    with open(os.path.join(file_path, 'search_results_information_got.json'), 'r', encoding='utf-8') as file:
+        rows = []
+        for line in file:
+            data = json.loads(line)
+            rows.append(data)
+
+    # Flatten the nested structures into a DataFrame
+    df = pd.json_normalize(rows)
+
+    # Save the DataFrame as an Excel file
+    df.to_excel(os.path.join(file_path, 'search_results_information_got.xlsx'), index=False)
+
+
+def combine_excel(path):
+    data_list = []
+    # common.list_all_files(path) returns every file path under the folder tree
+    for file in common.list_all_files(path):
+        # Only per-task result files are merged
+        if file.endswith("search_results_information_got.xlsx"):
+            # pd.read_excel(filename) reads an Excel file into a DataFrame,
+            # which is appended to the list
+            data_list.append(pd.read_excel(file))
+        else:
+            continue
+    # Concatenate all DataFrames
+    data_all = pd.concat(data_list)
+    # Save the merged DataFrame as a single Excel file
+    data_all.to_excel("all_results.xlsx", index=False)
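+
+# Typical post-processing, as wired up at the end of main.py
+# (the task folder below is a made-up example):
+#     json_to_excel("downloads/SchoolA/AU='Smith, J' AND AD='Some University'")
+#     combine_excel("downloads")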

diff --git a/main.py b/main.py
index c057073..abee9a8 100644
--- a/main.py
+++ b/main.py
@@ -1,20 +1,106 @@
 # encoding: utf-8
-from crawl import *
-from selenium import webdriver
-
-if __name__ == '__main__':
-    ################ Set up parameters here #####################
-    default_download_path = "C://Users/bigwh/Downloads" + "/savedrecs.txt"
-    # The first string should be the path where your file is downloaded to by default.
-    # Most likely, it should be like: "C://Users/usr_nm/Downloads"
-    task_list = [ # folder_name, query
-        ["results/search_1", "TI=(pFind) AND PY=(2016-2022)"],
-        ["results/search_2", "TI=(Attention is All you Need)"]
-    ]
-    # These are the tasks to be searched
-    driver = webdriver.Chrome(
-        executable_path='C://Program Files//Google//Chrome//Application//chromedriver.exe'
-        # This is the path where you place your chromedriver
-    )
-    #############################################################
-    start_session(driver, task_list, default_download_path)
+import common
+import crawl
+import pandas as pd
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+import tqdm
+import logging
+import os
+import time
+
+################ Set up parameters here #####################
+# Default browser download path.
+# The first string should be the path where your file is downloaded to by default.
+# Most likely, it should be like: "C://Users/usr_nm/Downloads"
+default_download_path = "C:/Users/yunruxian/Downloads" + "/savedrecs.txt"
+
+# Path of the Excel file that holds the search tasks
+task_path = "raw data file/teachers‘list(20240106).xls"
+
+# Read the search task file
+df = pd.read_excel(
+    task_path,
+    sheet_name="Sheet1",
+    header=0,
+    keep_default_na=False,
+)
+
+# Build the search task list
+task_list = []
+for i in range(0, len(df)):
+    school_id = df.iloc[i]["学校ID"]
+    school = df.iloc[i]["学校"]
+    teacher_id = df.iloc[i]["教师ID"]
+    teacher_name_cn = df.iloc[i]["姓名"]
+    teacher_name_en = df.iloc[i]["name"]
+    address_en = df.iloc[i]["address"]
+    address_en_plus = df.iloc[i]["address_plus"]
+    others_information = {"学校ID": school_id, "教师ID": teacher_id, "姓名": teacher_name_cn}
+    task_list.append([f"{school}", F"AU='{teacher_name_en}' AND AD='{address_en}'", others_information])
+    if address_en_plus:
+        task_list.append([f"{school}", F"AU='{teacher_name_en}' AND AD='{address_en_plus}'", others_information])
+
+"""
+Start the search of all tasks.
+task_list: save folders, advanced query strings and extra per-teacher information
+default_download_path: the default download path set for the system, for example, C://Downloads/
+"""
+
+# Init
+# Configure logging
+os.makedirs('logs', exist_ok=True)
+logging.basicConfig(level=logging.INFO,
+                    filename=os.getcwd() + '/logs/log' + time.strftime('%Y%m%d%H%M',
+                                                                       time.localtime(time.time())) + '.log',
+                    filemode="w",
+                    format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
+                    )
+
+if __name__ == '__main__':
+    threader = ThreadPoolExecutor(max_workers=5)
+    remaining_tasks = []
+    # Start Query
+    # tqdm.tqdm(task_list) wraps the iterable in a progress bar
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        # Single-threaded crawl command:
+        # crawl.start(school_folder, query_task, default_download_path, others_information)
+        # Multithreaded command:
+        future = threader.submit(crawl.start, school_folder, query_task, default_download_path, others_information)
+        remaining_tasks.append(future)
+
+    # Track the remaining threads
+    while True:
+        # Iterate over a copy: removing from the list being iterated would skip entries
+        for future in list(remaining_tasks):
+            if future.done():
+                remaining_tasks.remove(future)
+        common.show_progress(len(task_list), len(task_list) - len(remaining_tasks))
+        # Leave the loop once every task has finished
+        if len(remaining_tasks) == 0:
+            break
+
+    # Wait for all thread objects to finish.
+    # concurrent.futures.wait takes a list; by the time execution reaches this
+    # point the list is already empty, so the call below would be redundant:
+    # concurrent.futures.wait(remaining_tasks)
+    threader.shutdown()
+
+    # Check whether everything has completed
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        path_school = os.path.join("downloads", school_folder)
+        if not common.Check.check_task_finish_flag(path_school, query_task):
+            print(school_folder, query_task)
+
+    # Transform json to excel
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        if 'search_results_information_got.xlsx' not in os.listdir(
+                os.path.join("downloads", school_folder, query_task)
+        ):
+            try:
+                crawl.json_to_excel(os.path.join("downloads", school_folder, query_task))
+            except:
+                continue
+
+    crawl.combine_excel("downloads")

diff --git a/raw data file/example.xlsx b/raw data file/example.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..84172c08a491af3cdd18afb5ad69b682a243b0ea
GIT binary patch
literal 9001
[9001 bytes of base85-encoded xlsx data omitted]
zFX9h1&zprw?9I82^Q6M)!zX3|nojYxq~-jsV?jNVBn1zL_qym7FPK+|se6{Gtef)^ z60OzA8CK@S+e(m(c?!(~F598C4%?r51YK;85iS!n)@LOjL{QtkfV~ z%BN_GA+5oeL$F1$=UW5x$E$*6@k9_&#(JOfsEl)uCzu>8p6Lo6Wh*^k==9O_ah9Tr z9~-nafupqX{TtQ6Wk!X`eYtAT7>_^C{ppPFrz*-!zIhp3Y2LXo!#=Nss@qQ zTPb5XAv5WPoA{D3_aE*|WRVPbS=!B21P(aUADvO^>Db?QsItuSe8AvmZtdLhQ!b~S z_$rWFJ$FExL6GQ_e@wBT`SnqVQLW(q@JqiQ5$x{LfxW@-c-r?%NQU|+NW7p0&*{+s zfCD6i2$3t}>&)xp=ILbV=H~Q^x61TJT&MUzZH)T_=?I5a=VX`s!uJ=fai_&>pNCb_ zMD~RXV`-d=T1Zx3Gt1J;B@N>j>eYBZloI)pC zHOaE(yG4&2ix-c&_T}N=r@1~QDh^L0l}_!zQNgwk0T*9JRZ1o!Q;V8MF9$;U|F>Y1*y6r2HC6>^v^|72*$H~C6HuO-r%ILtgYD`%_L8?h+` z;VecuZ~j^bA1~QqaVOC$QroqzDM7L9RCOIS)_hjNH75zy9+$^*WUINGbHiZTY{(iyXO=xT+XSi{ErMU4%!WB} zYy$lsL&7hv27{C)=*6F=hVdQ5$1O$YCU#a2=*qk>?_XGNsXNOwa=d8~>D*fW!=~p3^?t20}q5gh)*OcOPZiYN|bapjO=LXg;4DC<%DZo0nlnZ<`Qn zu$CRhLN5?}xJ<>-Y-;dDl^4w|$`3Kp4?PhgSVbc~>g0SMM}+QA?2Hfvx{drCLBq1# zEx!}42qY%j+1f7M#U;r0B5smM9^lG@-1~9L#!L}iP{<&6*fbF(=s3#vgx}HYGtCe( z0uR@&g`P*~?39PsHZ;&>v)g{ZX_AE_^-hV9cwm$_UG#t=Z3ug^*Z2Ia7Cpp@x6H*5dpqoaYxodhyEfQA zy^OTf5NYlFz(=xiM)KukIu%Fg|<(y}j?P0E8T6Gw1# zt&an~KUtAk%C%Q^rkqwr8|CxRqj)n31XCqv8G9F+1*QVd)6$cwz^jAx2ZrhdnRz!-?Mn)&4nBJO#1;YXPclBrY{h z(ndB=XgsLy(^%A?`e)^dBjYf?lodNw{(-Z?LtfhxEKfaOT}pJT!$;AKqXYu(s5-HZ zogmdK;4j<#HehO+vFlT03g19f9m@alJO)c!hX~IU$e?tkQW|RCa`7OIWUMuU@H(`c z2G)OPAUdIqb{Og$e|d1$+O!kSHj8Jz>sO@@8(6xObAY*MwA)B}U->Nn4-zrtmrR#~ z>J}s@cCK`l?x@S(fxg%kicf5HAZ6qfQca{0R86808cx*97H1I17N;*23Q+iBe;mF; z^99};@nv7l`=f&QWo4!y+$A>WI_B{v14He5w6_D-V7kr9KxHuHs8r%{A%;@UB;c?G zttMHhCb^}?Io8IIhd%?zo(@bD)Ngx>1;j_+d^`==u3_(15N&9N^~9)UOCZT6;MzG(-+T!^M#RvAKW zmN?dg-3SC^l|l7l8e5NIOi48`J&)OUl-;8~8aPF1iV!f%wP8FrED6|i%sXz$%l(mQ zB%B%&z~J+gB2O*0yX$;1sh-uKyB^m1<|I4QqY2yr$T9at_n+6}B=AM_;i0xg{vO{#E47bn|-TGu5L)#b0YIWIhmYQc{uPh>Djhqnq!?*p5=<(~zBK)%*kpbD2al@O+i-Kbm@F zyx49g9}o#95p@=7d2`OWgY$IL!SpC=q(4VJNOG?m-q((*S^b0KNFqf+^dn!G+aQr{>R&0cW6buM^4=93lMN1zHtv~49EzZK&$L(<%|IOetZGeM_8 z+|0dlmcjm&lxJLW{jJz$ruYZ$o78;Z6y@`2Kwc5ZfehH1XQFD=G^RQHfc2_yd@za zkhEt+=SVNvHHGyJ#Rw;L*5rkz%uy&UJs75xS5_1Ig328wE6-{uw-e&!`k^VM)3@&J zGO9~;(N$&z5pfo_r{hn*2=d9KASh)oo1ww|6u~dm=6gD9T5Ah7T^N$KKd2ZG^n`4S zS#V5rs?9bNEcny1t<)jZB#rbe<_yftEX23CQP3j}v=X-%KbZ;Dw5#w!Oj@JK-K{{N zqpQ!Q#Dk@gm?M`5A73xExZg;Rzzb~r$-13t#O;ETcQlfBuMx}vGcAEfGNz6K1JEW3 z{n8`rb;}?HHgPa#TtcH!rB`NBp{%_w6?R?Dg7XsTT@zAcG&GU1#2OENmNq5^iP&*e z(j%d|quifeN4cW7eHP`F+0!iK=6mH(yy3M>wV&dSbvJ7W8xtfR8%X5oCy&uh{g;TFtWvhd!R47NjL=V(Kl5ppxbWGXpw{m?Js=yt7 zv*_Fp;CRR9?pa)~p^YY63=g8wKEw_jkX@Um+9Y|5D~gSxRmidGKaIf3y`RcNqfZc` zgD@rD*7Kv2!Asn|_gX z|8Rj~(gsQ>9Sa=E#`~J;e(#|8`*<$Vz3v4pL&Fdfmk*u7atenFQ?nZ%z5UeCD4t{Tem$P*2#=APqkbdZX9h4l24?%BN@3O1 zF1V`2pw2VgppsK>r8I%&`~*ul7=3j-I%S!e#N-`Oo3J$&Y0%l62M^U=oET{@CAcEj zM&SXRn0Mn`OFV|djFVjVG76z*B;BC3i6LDejOA$7t{;L!93PWnlo(=9(FKN>lDsn3 zz4*eKVI4FCb?Hjx(0Qd>64!XXrd^7-URwo$Ub0#Q&AQeGSiD=@Am9vcpDC#=sM6nz z*7_j1qbOY}9=P?u4rj-QDv6fBD6NWAo<#DH>FQC-#NqZ)#qv%)M(~M4^|P>~^#iRQ z)UHzB-5cEhL|UN^nH^(DI%`4_7yiFU%gfu(*~aTvruY`QXP=b%y=O0NZMjyaP9A{D zXB(C1GQA~@-R?7EErn~;wqO&)P?6Q*f8SKG6F9WQeJ!7t6GEvT6r4c$63^-%B3+>;Wj?<0@$Yet* z%S<(bmLZ%RG558cIHYyhxnm+r&CYGh+1Z~5Fg4NwI|s-jlouEt%I{0kg>U4H%yXt1 z5D$|+6KHX&;v?!kv}8|X!6ne$c2~mDjb@~IV}y7qIDr|A&RNS}Qo|a$U0PxZ3{vu0 z)dj2q`R@%8j%1}@^RjukMBkl)Xk1X$v=7Wj4(!g zUhyqrk#Po<{rXA3#RZJa{xbn8x9~G#mKzyzzTxx4@D9C>W^A`)F9eWZYa-6Mv9(RV z@YFq6R<$0?NBnZyxL&V5Olf~5aFc`NMJ}NcOcl)q&%Ab=tM}EYT_wxUpJE4noz6S> zn1aVF`_Ay*?hkW;c^ctt_bwJ-0%NoMBSxQyGM9^2s^Y!LoE5LP=u|xr6BBO!5c`uR zjeM=W2Lx!mwCHa{sFSKi?%-SUCUo0NP>zm%6N7ssylQ-oDv1Yc{V~w5GmQTIA>661 zQqvqqK73m;eD5S}kMQRH9PRHc@pe&{pQ;h}6>`_Cf!s1d{_uQz3r`zs9dAzuSG(T; 

Date: Sun, 14 Jan 2024 14:23:24 +0800
Subject: [PATCH 3/5] Update the content a bit

---
 readme.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/readme.md b/readme.md
index 008a47c..7b1d98b 100644
--- a/readme.md
+++ b/readme.md
@@ -3,9 +3,9 @@ ## Function
  Crawl Web of Science (WOS) information with python + selenium
 ## Base
- Based on @wos-selenium
+ Based on [jinyangl312/wos-selenium](https://github.com/jinyangl312/wos-selenium)
 ## New additions
- Added, on top of the original, the retrieval of some important
+ Added, on top of the original, the retrieval of some important information
 ## Some tips:
  I use the campus network, so no login is required

From 94b2fa5752360cf93cac9a240d8013eec9b3aa95 Mon Sep 17 00:00:00 2001
From: Yun <68860614+yunruxian@users.noreply.github.com>
Date: Sun, 14 Jan 2024 14:30:33 +0800
Subject: [PATCH 4/5] Add files via upload

---
 readme.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/readme.md b/readme.md
index 7b1d98b..1284674 100644
--- a/readme.md
+++ b/readme.md
@@ -1,14 +1,13 @@
 # wos-selenium
 ## Function
- Crawl Web of Science (WOS) information with python + selenium
+Crawl Web of Science (WOS) information with python + selenium
 ## Base
- Based on [jinyangl312/wos-selenium](https://github.com/jinyangl312/wos-selenium)
+Based on [jinyangl312/wos-selenium](https://github.com/jinyangl312/wos-selenium)
 ## New additions
- Added, on top of the original, the retrieval of some important information
-
+Added, on top of the original, the retrieval of some important information
 ## Some tips:
- I use the campus network, so no login is required
- I am an amateur who needed this, so I adapted someone else's code
- With multithreading, downloading the citation file (txt) from the WoS system and moving it into the chosen folder is buggy; I cannot fix it for now
- Feel free to get in touch; email jssyyrx@163.com
\ No newline at end of file
+I use the campus network, so no login is required
+I am an amateur who needed this, so I adapted someone else's code
+With multithreading, downloading the citation file (txt) from the WoS system and moving it into the chosen folder is buggy; I cannot fix it for now
+Feel free to get in touch by email
\ No newline at end of file

From 245f1ab9a8bf82e85d8dd88a338646cebfb0030a Mon Sep 17 00:00:00 2001
From: Yun <68860614+yunruxian@users.noreply.github.com>
Date: Sun, 14 Jan 2024 14:31:53 +0800
Subject: [PATCH 5/5] Add files via upload

---
 readme.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/readme.md b/readme.md
index 1284674..7f1d3f7 100644
--- a/readme.md
+++ b/readme.md
@@ -7,7 +7,7 @@ Crawl Web of Science (WOS) information with python + selenium
 ## New additions
 Added, on top of the original, the retrieval of some important information
 ## Some tips:
-I use the campus network, so no login is required
-I am an amateur who needed this, so I adapted someone else's code
-With multithreading, downloading the citation file (txt) from the WoS system and moving it into the chosen folder is buggy; I cannot fix it for now
-Feel free to get in touch by email
\ No newline at end of file
+I use the campus network, so no login is required  
+I am an amateur who needed this, so I adapted someone else's code  
+With multithreading, downloading the citation file (txt) from the WoS system and moving it into the chosen folder is buggy; I cannot fix it for now  
+Feel free to get in touch by email
\ No newline at end of file