diff --git a/common.py b/common.py
new file mode 100644
index 0000000..f21a46d
--- /dev/null
+++ b/common.py
@@ -0,0 +1,265 @@
+import time
+import os
+import sys
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import wait, expected_conditions
+
+
+def save_screenshot(driver, prefix, pic_path):
+    """Screenshot and save as a png"""
+    # Not called anywhere at the moment
+    # File name: paper_id + current_time
+    current_time = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
+    driver.save_screenshot(f'{pic_path}{str(prefix)}_{current_time}.png')
+
+
+# Task bookkeeping
+class Check:
+    """
+    Flags and checks that record how far a task has progressed:
+    1. whether all work for the task is finished
+    2. whether the file provided by the WOS export has been downloaded
+    3. whether the search-result subpages have all been processed
+    4. subpage: whether a search-result subpage has been downloaded
+    5. subpage: whether the wanted information has been extracted from a subpage
+    """
+    # Mark/check whether the whole task is finished
+    @staticmethod
+    def mark_task_finish_flag(path_school, query_task):
+        """Create a flag in the path to mark the task as completed."""
+        with open(os.path.join(path_school, query_task, 'completed.flag'), 'w', encoding='utf-8') as f:
+            f.write('1')
+        return True
+
+    @staticmethod
+    def check_task_finish_flag(path_school, query_task):
+        """Check the flag in the path to see whether the task has already been completed."""
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and 'completed.flag' in os.listdir(str(task_path))
+
+    # Check whether record_sys.txt has been downloaded
+    @staticmethod
+    def check_task_downloaded_file(path_school, query_task):
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and 'record_sys.txt' in os.listdir(str(task_path))
+
+    # Mark/check whether a subpage has been fully processed
+    @staticmethod
+    def mark_subpage_done(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        with open(
+                os.path.join(path_school, query_task, f'{subpage_name}.flag'),
+                'a+', encoding='utf-8') as file:
+            file.write(subpage_name)
+            file.write('\n')
+
+    @staticmethod
+    def check_subpage_done(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        task_path = os.path.join(path_school, query_task)
+        return os.path.exists(task_path) and f'{subpage_name}.flag' in os.listdir(str(task_path))
+
+    # Download bookkeeping per search result (each article under each task)
+    @staticmethod
+    def check_subpage_downloaded(path_school, query_task, current_url):
+        subpage_name = get_file_name(current_url)
+        file_path = os.path.join(path_school, query_task)
+        return (
+            os.path.exists(file_path)
+            and f'{subpage_name}.html' in os.listdir(str(file_path))
+            and f'{subpage_name}.dat' in os.listdir(str(file_path))
+        )
+
+    # Extraction bookkeeping per search result (each article under each task)
+    @staticmethod
+    def mark_get_subpage_selected_info(path_school, query_task, sub_page_url):
+        with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'a+', encoding='utf-8') as file:
+            file.write(sub_page_url)
+            file.write('\n')
+
+    @staticmethod
+    def check_get_subpage_selected_info(path_school, query_task, sub_page_url):
+        record_urls = set()
+        try:
+            with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'r', encoding='utf-8') as file:
+                for record_url in file.readlines():
+                    record_urls.add(record_url.strip("\n"))
+            return sub_page_url in record_urls
+        except FileNotFoundError:
+            # This is the first subpage processed, so 'search_result_record.txt' does not exist yet
+            return False
+
+    @staticmethod
+    def check_total_handled(path_school, query_task, n_record):
+        record_urls = set()
+        with open(os.path.join(path_school, query_task, 'search_result_record.txt'), 'r', encoding='utf-8') as file:
+            for record_url in file.readlines():
+                record_urls.add(record_url.strip("\n"))
+        return len(record_urls) == n_record
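+
+# Usage sketch (illustration only; the real call sites are in crawl.py),
+# assuming the downloads/<school>/<query_task> layout created there:
+#   if not Check.check_task_finish_flag(path_school, query_task):
+#       ...run the search, then for every record subpage...
+#       Check.mark_subpage_done(path_school, query_task, current_url)
+#       Check.mark_task_finish_flag(path_school, query_task)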
+
+
+def roll_down(driver, fold=40):
+    """
+    Roll down to the bottom of the page to load all results
+    """
+    # fold = number of scroll steps: move down 500px per step, 40 times by default
+    for i_roll in range(1, fold + 1):
+        time.sleep(0.1)
+        driver.execute_script(f'window.scrollTo(0, {i_roll * 500});')
+
+
+def show_more(driver):
+    """
+    Expand information hidden behind "show more" controls.
+    """
+    # Show all authors et al.
+    element = wait.WebDriverWait(driver, 20).until(
+        expected_conditions.element_to_be_clickable(
+            (
+                By.XPATH,
+                '//button[@id="HiddenSecTa-showMoreDataButton"]'
+            )))
+    driver.execute_script("arguments[0].click();", element)
+    # Some pages can locate the button directly; on others it never works:
+    # driver.find_element(
+    #     By.XPATH,
+    #     '/button[@id="HiddenSecTa-showMoreDataButton"]'
+    # ).click()
+    try:
+        driver.find_element(By.XPATH, '//*[text()="...More"]').click()
+    except:
+        pass
+    try:
+        driver.find_element(By.XPATH, '//*[text()=" ...more addresses"]').click()
+    except:
+        pass
+
+
+def check_if_human(driver):
+    """
+    Check whether a robot-detection challenge has appeared.
+    """
+    # Raise outside the try blocks; a raise inside them would be swallowed
+    # by the broad except clauses.
+    try:
+        driver.find_element(
+            By.XPATH,
+            '//*[contains(text(),"Please verify you are human to proceed.")]'
+        )
+    except NoSuchElementException:
+        try:
+            driver.find_element(
+                By.XPATH,
+                '//*[contains(text(),"我是人类")]'
+            )
+        except NoSuchElementException:
+            return
+    raise Exception("Robot detection triggered")
+
+
+def get_file_name(current_url):
+    """Turn a full-record URL into a safe file name, e.g. WOS:xxx -> WOS_xxx."""
+    subpage_id = current_url.replace("https://webofscience.clarivate.cn/wos/alldb/full-record/", "")
+    pre, suf = subpage_id.split(":")
+    return pre + "_" + suf
+
+
+def decorator(func):
+    """
+    Decorator: if the wrapped lookup raises, make the return value an empty string.
+    """
+    def wrapper(*args, **kwargs):
+        try:
+            result = func(*args, **kwargs)
+        except:
+            result = ""
+        return result
+
+    return wrapper
+
+
+@decorator
+def get_element_text(driver, match_condition):
+    """
+    Find an element by XPath and return its text; "" if it is missing.
+    """
+    info = driver.find_element(
+        By.XPATH, match_condition
+    ).text
+    return info
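+
+# Illustration of the decorator in action, using an XPath crawl.py actually
+# passes in: a missing node yields "" instead of an exception, so optional
+# fields can be filled in directly.
+#   volume = get_element_text(driver, '//span[@id="FullRTa-volume"]')  # "" if absent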
+
+
+def close_pendo_windows(driver):
+    """Close guiding windows"""
+    # Cookies
+    try:
+        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
+    except:
+        pass
+    # "Remind me later"
+    try:
+        driver.find_element(By.XPATH, '//*[@id="pendo-close-guide-5600f670"]').click()
+    except:
+        pass
+    # "Step 1 of 2"
+    try:
+        driver.find_element(By.XPATH, '//*[@id="pendo-close-guide-30f847dd"]').click()
+    except:
+        pass
+    # "Got it"
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click()
+    except:
+        pass
+    # "No thanks"
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click()
+    except:
+        pass
+    # What was it... I forgot...
+    try:
+        driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")]').click()
+    except:
+        pass
+    # Overlay
+    try:
+        driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click()
+    except:
+        pass
+    # Overlay dialog
+    try:
+        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click()
+    except:
+        pass
+    # "全部允许" (allow-all-cookies button)
+    try:
+        driver.find_element(By.XPATH, '//button[contains(text(), "全部允许")]').click()
+    except:
+        pass
+
+
+# Display progress
+def show_progress(total, current):
+    percent = 100 * current / total
+    progress = "█" * int(percent / 10)
+    sys.stdout.write(f"\r{percent:3.0f}%|{progress:<10}| {current}/{total}")
+    sys.stdout.flush()
+
+
+# Walk a folder tree and collect every file in it
+def list_all_files(path):
+    all_file_list = list()
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            print(file_path)
+            all_file_list.append(file_path)
+    return all_file_list
diff --git a/crawl.py b/crawl.py
index c818fe3..640e9de 100644
--- a/crawl.py
+++ b/crawl.py
@@ -1,18 +1,82 @@
 # encoding: utf-8
-from logging import handlers
 import pathlib
 import shutil
 import time
 import logging
 import os
 import re
-import tqdm
+import json
+import pandas as pd
+from selenium.common import NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import wait, expected_conditions
+import common
+from selenium import webdriver
+
+check = common.Check
+
+
+# Entry point: run one search task
+def start(school, query_task, default_download_path, others_information):
+
+    # Configure browser options
+    options = webdriver.ChromeOptions()
+    options.add_argument("--disable-infobars")
+    options.add_argument("--window-size=1920,1080")
+    options.add_experimental_option('useAutomationExtension', False)
+    options.add_experimental_option('excludeSwitches', ['enable-automation'])
+    # Hide the "browser is being controlled by selenium" banner
+    options.add_argument("--disable-blink-features=AutomationControlled")
+
+    # Make sure the download path ends with the file name
+    if not default_download_path.endswith("/savedrecs.txt"):
+        default_download_path += "/savedrecs.txt"
+
+    # driver.get("https://webofscience.clarivate.cn/")
+    # wait_for_login(driver)
+    # switch_language_to_eng(driver)
+
+    path_school = os.path.join("downloads", school)
+
+    # Skip the task if it has already been processed
+    if path_school is not None and check.check_task_finish_flag(path_school, query_task):
+        return False
+
+    # Instantiate the driver and run
+    driver = webdriver.Chrome(options=options)
+    # Maximize the window
+    driver.maximize_window()
+
+    # Search query
+    if not search_query(driver, path_school, query_task):
+        # Stop if download failed for some reason
+        return False
+
+    # Download from the results page.
+    # The block below checks whether the WOS export file for this task has
+    # already been downloaded. It cannot be used with multiple threads: a
+    # thread cannot be guaranteed to pick up the file it downloaded itself,
+    # and since threads are fast the downloaded file names keep changing.
+    # if not check.check_task_downloaded_file(path_school, query_task):
+    #     # Download the search results using inner website function
+    #     if not download_search_results(driver, default_download_path):
+    #         return False
+    #     # Deal with the downloaded search results file(.txt)
+    #     if not deal_with_downloaded_file(driver, default_download_path, path_school, query_task):
+    #         return False
+
+    # Open every record page, download it and scrape its information
+    # Deal with records
+    if not deal_with_records(driver, path_school, query_task, others_information):
+        return False
+
+    # Search completed
+    if path_school is not None:
+        check.mark_task_finish_flag(path_school, query_task)
+    driver.quit()
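+
+# Minimal single-task invocation (mirrors the loop in main.py; the school,
+# query and paths below are placeholders, not project data):
+#   start("SomeUniversity",
+#         "AU='Smith, J' AND AD='Some University'",
+#         "C:/Users/me/Downloads",
+#         {"学校ID": 1, "教师ID": 1, "姓名": "Smith"})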
 
 
 def wait_for_login(driver):
-    '''Wait for the user to login if wos cannot be accessed directly.'''
+    """Wait for the user to login if wos cannot be accessed directly."""
     try:
         driver.find_element(By.XPATH, '//div[contains(@class, "shibboleth-login-form")]')
         input('Login before going next...\n')
@@ -20,99 +84,66 @@ def wait_for_login(driver):
         pass
 
 
-def switch_language_to_Eng(driver):
-    '''Switch language from zh-cn to English.'''
+def switch_language_to_eng(driver):
+    """Switch language from zh-cn to English."""
     wait.WebDriverWait(driver, 10).until(
         expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, "search-main-box")]')))
-    close_pendo_windows(driver)
+    # Dismiss the popup windows first
+    common.close_pendo_windows(driver)
+
     try:
         driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
         driver.find_element(By.XPATH, '//button[@lang="en"]').click()
     except:
-        close_pendo_windows(driver)
+        common.close_pendo_windows(driver)
         driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
         driver.find_element(By.XPATH, '//button[@lang="en"]').click()
 
 
-def close_pendo_windows(driver):
-    '''Close guiding windows'''
-    # Cookies
-    try:
-        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
-    except:
-        pass
-    # "Got it"
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click()
-    except:
-        pass
-    # "No thanks"
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click()
-    except:
-        pass
-    # What was it... I forgot...
-    try:
-        driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")').click()
-    except:
-        pass
-    # Overlay
-    try:
-        driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click()
-    except:
-        pass
-    # Overlay dialog
-    try:
-        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click()
-    except:
-        pass
-
-
-def mark_flag(path):
-    '''Create a flag in the path to mark the task as completed.'''
-    with open(os.path.join(path, 'completed.flag'), 'w') as f:
-        f.write('1')
-
-
-def check_flag(path):
-    '''Check if the flag in the path to check if task has been searched.'''
-    return os.path.exists(path) and 'completed.flag' in os.listdir(path)
-
-
-def search_query(driver, path, query):
-    '''Go to advanced search page, insert query into search frame and search the query.'''
-    if not path == None:
+def search_query(driver, path, query_task):
+    """
+    Go to advanced search page, insert query into search frame and search the query.
+ """ + # 看path是否是假的值,如果不是就继续,建立对应文件夹,不然用open with 时就会出错 + if path is not None: os.makedirs(path, exist_ok=True) - logging.info(path) + logging.info(f"{path}文件夹已经建立") + second_path = os.path.join(path, f'{query_task}') + os.makedirs(second_path, exist_ok=True) + logging.info(f"{path}-{query_task}文件夹已经建立") # Close extra windows if not len(driver.window_handles) == 1: handles = driver.window_handles - for i_handle in range(len(handles)-1, 0, -1): # traverse in reverse order + for i_handle in range(len(handles) - 1, 0, -1): # traverse in reverse order # Switch to the window and load the page driver.switch_to.window(handles[i_handle]) driver.close() driver.switch_to.window(handles[0]) - ## Search query - driver.get("https://www.webofscience.com/wos/alldb/advanced-search") + # Search query + driver.get("https://webofscience.clarivate.cn/wos/alldb/advanced-search") max_retry = 3 retry_times = 0 - while True: + while True: try: - close_pendo_windows(driver) + common.close_pendo_windows(driver) # Load the page wait.WebDriverWait(driver, 10).until( - expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]'))) + expected_conditions.presence_of_element_located( + (By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]') + ) + ) # Clear the field driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]').click() # Insert the query - driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query)) + driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query_task)) # Click on the search button - driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click() + driver.find_element(By.XPATH, + '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click() break except: retry_times += 1 @@ -122,18 +153,21 @@ def search_query(driver, path, query): else: # Retry logging.debug("Search retrying") + # Wait for the query page try: - wait.WebDriverWait(driver, 5).until( + # 根据文章链接判断是否加载成功 + wait.WebDriverWait(driver, 10).until( expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link'))) - except: + except: try: # No results - driver.find_element(By.XPATH, '//*[text()="Your search found no results"]') - logging.warning(f'Your search found no results') + driver.find_element(By.XPATH, '//*[text()="No records were found to match your filters"]') + logging.warning('No records were found to match your filters') # Mark as completed - if not path == None: - mark_flag(path) + # 没有搜索结果时的任务标记 + if path is not None: + check.mark_task_finish_flag(path, query_task) return False except: # Search failed @@ -144,231 +178,512 @@ def search_query(driver, path, query): return True -def download_outbound(driver, default_download_path): - '''Export the search results as outbound. The file is downloaded to default path set for the system.''' +def download_search_results(driver, default_download_path): + """ + Export the search results using inner website function. The file is downloaded to default path set for the system. + """ max_retry = 3 retry_times = 0 - while True: - close_pendo_windows(driver) + while True: + time.sleep(0) + common.close_pendo_windows(driver) # Not support search for more than 1000 results yet - assert int(driver.find_element(By.XPATH, '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!" 
 
 
-def download_outbound(driver, default_download_path):
-    '''Export the search results as outbound. The file is downloaded to default path set for the system.'''
+def download_search_results(driver, default_download_path):
+    """
+    Export the search results using the site's built-in export. The file is downloaded to the system default download path.
+    """
     max_retry = 3
     retry_times = 0
-    while True:
-        close_pendo_windows(driver)
+    while True:
+        time.sleep(0)
+        common.close_pendo_windows(driver)
         # Not support search for more than 1000 results yet
-        assert int(driver.find_element(By.XPATH, '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!"
+        assert int(driver.find_element(By.XPATH,
+                                       '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!"
         # File should not exist on default download folder
-        assert not os.path.exists(default_download_path), "File existed on default download folder!"
-        try:
+        # assert not os.path.exists(default_download_path), "File existed on default download folder!"
+        try:
+            common.close_pendo_windows(driver)
             # Click on "Export"
-            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click()
+            driver.find_element(By.XPATH,
+                                '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click()
             # Click on "Plain text file"
             try:
-                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]').click()
+                driver.find_element(By.XPATH,
+                                    '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]'
+                                    ).click()
             except:
-                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]').click()
+                driver.find_element(By.XPATH,
+                                    '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]'
+                                    ).click()
             # Click on "Records from:"
             driver.find_element(By.XPATH, '//*[text()[contains(string(), "Records from:")]]').click()
             # Click on "Export"
             driver.find_element(By.XPATH, '//span[contains(@class, "ng-star-inserted") and text()="Export"]').click()
             # Wait for download to complete
             for retry_download in range(4):
-                time.sleep(2)
+                time.sleep(0.5)
                 try:
                     # If there is any "Internal error"
                     wait.WebDriverWait(driver, 2).until(
-                        expected_conditions.presence_of_element_located((By.XPATH, '//div[text()="Server encountered an internal error"]')))
+                        expected_conditions.presence_of_element_located(
+                            (By.XPATH, '//div[text()="Server encountered an internal error"]')))
                     driver.find_element(By.XPATH, '//div[text()="Server encountered an internal error"]')
-                    driver.find_element(By.XPATH, '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click()
+                    driver.find_element(By.XPATH,
+                                        '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click()
                 except:
+                    # Exit the loop once the download has succeeded
                     if os.path.exists(default_download_path):
                         break
             # Download completed
             assert os.path.exists(default_download_path), "File not found!"
             return True
         except:
-            retry_times += 1
+            retry_times += 1
             if retry_times > max_retry:
                 logging.error("Crawl outbound exceeded max retries")
                 return False
             else:
                 # Retry
                 logging.debug("Crawl outbound retrying")
-                close_pendo_windows(driver)
+                common.close_pendo_windows(driver)
                 # Click on "Cancel"
                 try:
-                    driver.find_element(By.XPATH, '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click()
+                    driver.find_element(By.XPATH,
+                                        '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click()
                 except:
                     driver.refresh()
-                    time.sleep(1)
+                    time.sleep(0)
                     wait.WebDriverWait(driver, 10).until(
                         expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))
                 continue
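+
+# Sketch of a more explicit download wait (an assumption, not project code):
+# poll for savedrecs.txt with a time budget instead of a fixed retry count.
+#   def wait_for_download(path, timeout=30.0, step=0.5):
+#       waited = 0.0
+#       while waited < timeout:
+#           if os.path.exists(path):
+#               return True
+#           time.sleep(step)
+#           waited += step
+#       return False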
 
 
-def process_outbound(driver, default_download_path, dst_path):
-    '''Process the outbound downloaded to the default path set for the system.'''
-
+def deal_with_downloaded_file(driver, default_download_path, dst_path, query_task):
+    """
+    Move the downloaded export into the task folder and check that the number
+    of records in the file matches the number of search hits.
+    Process the outbound downloaded to the default path set for the system.
+    """
     # Move the outbound to dest folder
     assert os.path.exists(default_download_path), "File not found!"
+    # Check whether the destination is a directory or a file
     if pathlib.Path(dst_path).is_dir():
-        dst_path = os.path.join(dst_path, 'record.txt')
+        # os.path.join instead of a hard-coded "\\" so the path also works off Windows
+        dst_path = os.path.join(dst_path, query_task, 'record_sys.txt')
+    # shutil.move does the actual move
     shutil.move(default_download_path, dst_path)
     logging.debug(f'Outbound saved in {dst_path}')
 
-    # Load the downloaded outbound (for debug)
-    with open(dst_path, "r", encoding='utf-8') as f_outbound:
-        n_record_ref = len(re.findall("\nER\n", f_outbound.read()))
-    assert n_record_ref == int("".join(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text.split(","))), "Records num do not match outbound num"
+    # Load the downloaded file (for debug)
+    # Check that the number of records in the file equals the count shown on the page
+    with open(dst_path, "r", encoding='utf-8') as f_file:
+        n_record_ref = len(re.findall("\nER\n", f_file.read()))
+    assert n_record_ref == int("".join(
+        driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text.split(
+            ","))), "Records num do not match outbound num"
     return True
 
 
-def download_record(driver, path, records_id):
-    '''Download the page to the path'''
-    # Load the page or throw exception
-    wait.WebDriverWait(driver, 10).until(
-        expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]')))
-
-    # Download the record
-    with open(os.path.join(path, f'record-{records_id}.html'), 'w', encoding='utf-8') as file:
-        file.write(driver.page_source)
-    logging.debug(f'record #{records_id} saved in {path}')
-
-
-def process_record(driver, path, records_id):
-    '''Parse a page to get certain statistics'''
-    # Show all authors and save raw data
-    try:
-        driver.find_element(By.XPATH, '//*[text()="...More"]').click()
-    except:
-        pass
-    with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file:
-        file.write(driver.page_source)
-    logging.debug(f'record #{records_id} saved in {path}')
-
-
-def roll_down(driver, fold = 40):
-    '''Roll down to the bottom of the page to load all results'''
-    for i_roll in range(1, fold+1):
-        time.sleep(0.1)
-        driver.execute_script(f'window.scrollTo(0, {i_roll * 500});')
-
-
-def save_screenshot(driver, prefix, pic_path):
-    """Screenshot and save as a png"""
-
-    # paper_id + current_time
-    current_time = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
-    driver.save_screenshot(f'{pic_path}{str(prefix)}_{current_time}.png')
-
-
-def process_windows(driver, path, records_id):
-    '''Process all subpages'''
-    handles = driver.window_handles
-    has_error = False
-    for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order
-        # Switch to the window and load the page
-        driver.switch_to.window(handles[i_handle])
-        close_pendo_windows(driver)
-        try:
-            download_record(driver, path, records_id)
-            process_record(driver, path, records_id)
-        except:
-            logging.error("Record downloading failed!")
-            has_error = True
-        records_id += 1
-        driver.close()
-    driver.switch_to.window(handles[0])
-    return len(handles) - 1 if not has_error else -1
-
 
+def deal_with_records(driver, path, query_task, others_information):
+    """
+    Open records as new subpages, then download or parse the subpages according to the setting.
 
-def process_records(driver, path):
-    '''Open records as new subpages, download or parse subpages according to the setting.'''
+    Open each search result (record) as a new subpage, then deal with the open
+    subpages with process_windows() ("windows" because it only handles subpages
+    that are already open, which is why it has to be called repeatedly):
+        download all subpages
+        get_subpage_inf_wanted() collects the wanted information from each page
+    """
     # init
+    # Work out how many subpages need to be opened
     n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text)
     n_page = (n_record + 50 - 1) // 50
     assert n_page < 2000, "Too many pages"
     logging.debug(f'{n_record} records found, divided into {n_page} pages')
-
+
+    # Counter over search results
     records_id = 0
+    # Set of record URLs seen so far
     url_set = set()
+
     for i_page in range(n_page):
+        # Exactly one browser window should be open at this point
         assert len(driver.window_handles) == 1, "Unexpected windows"
-        roll_down(driver)
-
+
+        # Scroll to the bottom first so that every summary-record-title-link is rendered
+        common.roll_down(driver)
+
         # Open every record in a new window
         windows_count = 0
         for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, "summary-record-title-link")]'):
+            # Skip records that have already been opened
            if record.get_attribute("href") in url_set:
-                # coz some records have more than 1 href link
-                continue
+                # coz some records have more than 1 href link
+                continue
             else:
-                url_set.add(record.get_attribute("href"))
-                time.sleep(0.5)
+                url_set.add(record.get_attribute("href"))
+            # An extra check with the same purpose as above:
+            # skip the subpage if it has already been fully processed
+            current_url = record.get_attribute("href")
+            if check.check_subpage_done(path, query_task, current_url):
+                continue
+            # Open one page
             driver.execute_script(f'window.open(\"{record.get_attribute("href")}\");')
+            time.sleep(1)
             windows_count += 1
+            # Process in batches: only once at least 10 windows are open and the
+            # count is a multiple of 5, i.e. at 10, 15, 20, ...
+            # This seems to be the only workable shape if threading is wanted later.
             if windows_count >= 10 and not windows_count % 5:
                 # Save records and close windows
-                increment = process_windows(driver, path, records_id)
+                # The return value is the number of pages processed
+                increment = process_windows(driver, path, query_task, others_information)
                 if increment != -1:
                     records_id += increment
                 else:
                     return False
-        time.sleep(5)
-
+        time.sleep(0)
+        # This handles the leftover windows of the last batch,
+        # e.g. the remaining 3 of 53 search results.
         # Save records and close windows
-        increment = process_windows(driver, path, records_id)
+        increment = process_windows(driver, path, query_task, others_information)
         if increment != -1:
             records_id += increment
         else:
             return False
 
         # Go to the next page
-        if i_page + 1 < n_page:
-            driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, "arrowRight")]').click()
-    return True
-
+        if i_page + 1 < n_page:
+            element = wait.WebDriverWait(driver, 20).until(
+                expected_conditions.element_to_be_clickable(
+                    (
+                        By.XPATH,
+                        '//button[contains(@data-ta, "next-page-button")]'
+                    )))
+            driver.execute_script("arguments[0].click();", element)
+    # The return value depends on whether the number of records handled
+    # matches the number the search found
+    if check.check_total_handled(path, query_task, n_record):
+        return True
+    else:
+        logging.error("Record handled num does not equal the num searched!")
+        return False
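+
+# Worked example of the batching above (numbers implied by the code): with
+# n_record = 53, n_page = (53 + 50 - 1) // 50 = 2. On the first page of 50
+# records process_windows fires at windows_count 10, 15, ..., 50; on the
+# second page only the trailing call runs, handling the remaining 3 windows.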
+
+
+def process_windows(driver, path, query_task, others_information):
+    """
+    Process all open subpages.
+    path: path of the task
+    """
+    handles = driver.window_handles
+    has_error = False
+    for i_handle in range(len(driver.window_handles) - 1, 0, -1):  # traverse in reverse order
+        # Switch to the window and load the page
+        driver.switch_to.window(handles[i_handle])  # start from the last window
+        common.close_pendo_windows(driver)
+        current_url = driver.current_url
+        time.sleep(1)
+        # Expand the hidden page content
+        common.show_more(driver)
+        # Download the page
+        if not check.check_subpage_downloaded(path, query_task, current_url):
+            if not download_subpage(driver, path, query_task):
+                logging.error(f"Page({current_url}) download mistake!")
+                has_error = True
+                driver.close()
+                continue
+        # Extract the selected information
+        if not check.check_get_subpage_selected_info(path, query_task, current_url):
+            if not get_subpage_inf_wanted(driver, path, query_task, others_information):
+                logging.error(f"Page({current_url}) information get mistake!")
+                has_error = True
+                driver.close()
+                continue
+        driver.close()
+        # Mark this subpage as finished
+        check.mark_subpage_done(path, query_task, current_url)
 
-def start_session(driver, task_list, default_download_path):
-    '''
-    Start the search of all tasks.
-    driver: the handle of a selenium.webdriver object
-    task_list: the zip of save paths and advanced query strings
-    default_download_path: the default path set for the system, for example, C://Downloads/
-    '''
-
-    # Init
-    os.makedirs('logs', exist_ok=True)
-    logging.basicConfig(level=logging.INFO,
-                        filename=os.getcwd() + '/logs/log' + time.strftime('%Y%m%d%H%M',
-                                                                           time.localtime(time.time())) + '.log',
-                        filemode="w",
-                        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
-                        )
+    driver.switch_to.window(handles[0])
+    # On success return len(handles) - 1, i.e. how many windows were processed;
+    # otherwise return -1
+    return len(handles) - 1 if not has_error else -1
 
-    if not default_download_path.endswith("/savedrecs.txt"):
-        default_download_path += "/savedrecs.txt"
-    driver.get("https://www.webofscience.com/")
-    wait_for_login(driver)
-    # switch_language_to_Eng(driver)
 
-    # Start Query
-    for path, query in tqdm.tqdm(task_list):
-        if not path == None and check_flag(path): continue
+def download_subpage(driver, path, query_task):
+    """
+    Download the page to the path
+    """
+    try:
+        # Load the page or throw exception
+        wait.WebDriverWait(driver, 10).until(
+            expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]')))
+
+        current_url = driver.current_url
+        subpage_name = common.get_file_name(current_url)
+
+        # Download the record
+        with open(os.path.join(path, query_task, f"{subpage_name}.html"), 'w', encoding='utf-8') as file:
+            file.write(driver.page_source)
+        logging.debug(f'record # {subpage_name} saved in {path}/{query_task}')
+
+        # Download the record in dat
+        with open(os.path.join(path, query_task, f"{subpage_name}.dat"), 'w', encoding='utf-8') as file:
+            file.write(driver.page_source)
+        logging.debug(f'record # {subpage_name} saved in {path}/{query_task}')
+        return True
+    except:
+        return False
 
-        # Search query
-        if not search_query(driver, path, query):
-            # Stop if download failed for some reason
-            continue
 
+def get_subpage_inf_wanted(driver, path, query_task, others_information):
+    """
+    Scrape the following fields: title, journal, publication date, authors,
+    affiliations, document type, and so on.
+    """
+    try:
+        # Page URL
+        current_url = driver.current_url
+        # Title
+        title_en = driver.find_element(
+            By.XPATH, '//*[@id="FullRTa-fullRecordtitle-0" and @lang="en"]'
+        ).text.capitalize()
+        title_zh_cn = common.get_element_text(
+            driver, '//*[@id="FullRTa-fullRecordtitle-0" and @lang="zh-cn"]'
+        )
+        # Journal name, unlinked variant.
+        # The unlinked and linked variants differ: one is a span, the other an a element
+        try:
+            try:
+                journal_en = driver.find_element(
+                    By.XPATH,
+                    '//span[contains(@class, "summary-source-title") and @lang="en"]'
+                ).text.capitalize()
+                journal_zh_cn = common.get_element_text(
+                    driver,
+                    '//span[contains(@class, "summary-source-title") and @lang="zh-cn"]'
+                )
+            except:
+                # Journal name with a link
+                journal_en = driver.find_element(
+                    By.XPATH,
+                    '//a[contains(@class, "summary-source-title-link") and @lang="en"]'
+                ).text.capitalize()
+                journal_zh_cn = common.get_element_text(
+                    driver,
+                    '//a[contains(@class, "summary-source-title-link") and @lang="zh-cn"]'
+                )
+        except:
+            journal_en = "No journal name for this record; check the Document Type"
+            journal_zh_cn = ""
+        # Author information, English
+        authors_info_en = get_authors_info(driver, language="en")
+        # Author information, Simplified Chinese
+        authors_info_zh_cn = get_authors_info(driver, language="zh_cn")
+        # Publication date
+        published_date = common.get_element_text(driver, '//span[@name="pubdate"]')
+        # Date indexed
+        indexed_date = common.get_element_text(driver, '//span[@name="indexedDate"]')
+        # Document type
+        document_type = common.get_element_text(driver, '//span[@id="FullRTa-doctype-0"]')
+        # Volume
+        volume = common.get_element_text(driver, '//span[@id="FullRTa-volume"]')
+        # Issue
+        issue = common.get_element_text(driver, '//span[@id="FullRTa-issue"]')
+        # Page numbers
+        pagenum = common.get_element_text(driver, '//span[@id="FullRTa-pageNo"]')
+        # DOI
+        doi = common.get_element_text(driver, '//span[@id="FullRTa-DOI"]')
+        # Abstract, English
+        abstract_en = common.get_element_text(
+            driver, '//div[@id="FullRTa-abstract-basic" and @lang = "en"]/p')
+        # Abstract, Simplified Chinese
+        abstract_zh_cn = common.get_element_text(
+            driver, '//div[@id="FullRTa-abstract-basic" and @lang = "zh-cn"]/p')
+        # Language
+        language = common.get_element_text(driver, '//span[@id="HiddenSecTa-language-0"]')
+        # Times cited
+        try:
+            cited_num = driver.find_element(
+                By.XPATH, '//*[contains(@id, "FullRRPTa-wos-citation-network-times-cited-count-link")]'
+            ).text
+        except:
+            cited_num = 0
+        # Corresponding (reprint) authors
+        corresponding_author_set = set()
+        for i in range(5):
+            try:
+                corresponding_author_elements = wait.WebDriverWait(driver, 10).until(
+                    expected_conditions.presence_of_all_elements_located(
+                        (By.XPATH, f'//div[@id="FRAiinTa-RepAddrTitle-{i}"]/div/div')
+                    )
+                )
+                for corresponding_author_element in corresponding_author_elements:
+                    corresponding_author = corresponding_author_element.find_element(
+                        By.XPATH,
+                        './/span[@class="value"]'
+                    ).text
+                    corresponding_author_set.add(corresponding_author)
+            except:
+                try:
+                    corresponding_author = wait.WebDriverWait(driver, 10).until(
+                        expected_conditions.presence_of_element_located(
+                            (By.XPATH, f'//div[@id="FRAiinTa-RepAddrTitle-{i}"]/div/div/span[@class="value"]')
+                        )
+                    ).text
+                    corresponding_author_set.add(corresponding_author)
+                except:
+                    break
+        corresponding_author_list = list(corresponding_author_set)
+
+        subpage_inf = dict(
+            school_id=str(others_information["学校ID"]),
+            school=path,
+            teacher_id=str(others_information["教师ID"]),
+            teacher_name=others_information["姓名"],
+            current_url=current_url,
+            title=dict(title_en=title_en, title_zh_cn=title_zh_cn),
+            journal=dict(journal_en=journal_en, journal_zh_cn=journal_zh_cn),
+            authors_info=dict(authors_info_en=authors_info_en, authors_info_zh_cn=authors_info_zh_cn),
+            corresponding_author_list=corresponding_author_list,
+            cited_num=cited_num,
+            published_date=published_date,
+            indexed_date=indexed_date,
+            volume=volume,
+            issue=issue,
+            pagenum=pagenum,
+            doi=doi,
+            document_type=document_type,
+            abstract=dict(abstract_en=abstract_en, abstract_zh_cn=abstract_zh_cn),
+            language=language,
+            time_data=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
+        )
+
+        # Write a pretty-printed copy as txt
+        # (the with block closes the file itself; no explicit close needed)
+        with open(os.path.join(path, query_task, 'search_results_information_got.txt'), 'a', encoding='utf-8') as file:
+            json.dump(subpage_inf, file, indent=4, ensure_ascii=False, allow_nan=True)
+            file.write("\n")
+            logging.debug(f'record #{current_url} dict saved in {path}-{query_task}')
+
+        # Write one JSON object per line
+        with open(os.path.join(path, query_task, 'search_results_information_got.json'), 'a', encoding='utf-8') as file:
+            json.dump(subpage_inf, file, ensure_ascii=False, allow_nan=True)
+            file.write("\n")
+            logging.debug(f'record #{current_url} dict saved in {path}-{query_task}')
+
+        # Mark this page as scraped
+        check.mark_get_subpage_selected_info(path, query_task, current_url)
+        return True
+    except:
+        return False
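+
+# Shape of one line of search_results_information_got.json (keys exactly as
+# built above; the values here are placeholders):
+#   {"school_id": "1", "school": "downloads/SomeUniversity", "teacher_id": "1",
+#    "teacher_name": "...", "current_url": "https://...",
+#    "title": {"title_en": "...", "title_zh_cn": "..."}, ...,
+#    "time_data": "2024-01-06 12:00:00"}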
+
+
+def get_authors_info(driver, language="en"):
+    """
+    The author-information part is long and repeated, so it lives in its own function.
+    """
+    authors_info = list()
+
+    author_elements = driver.find_elements(
+        By.XPATH,
+        '//div[@id="SumAuthTa-MainDiv-author-{lang}"]/span/span[@class="value ng-star-inserted"]'.format(lang=language)
+    )
+    # find_elements does not raise when nothing matches; it returns an empty list.
+    # In that case return a placeholder entry, e.g. for an all-English page
+    # that has no Chinese author block.
+    if not author_elements:
+        author_info = dict(
+            author_order="",
+            author_name_dis="",
+            author_name_std="",
+            author_addresses=[],
+            author_email="",
+        )
+        authors_info.append(author_info)
+        return authors_info
+    # Otherwise walk through the authors
+    author_order = 0
+    for author_element in author_elements:
+        # Display name
+        author_name_dis = author_element.find_element(
+            By.XPATH,
+            './/a[@id="SumAuthTa-DisplayName-author-{lang}-{order}"]'.format(order=author_order, lang=language)
+        ).text
+        # Standardized name
+        author_name_std = common.get_element_text(
+            author_element,
+            './/span[@id="SumAuthTa-FrAuthStandard-author-{lang}-{order}"]/span'.format(
+                order=author_order, lang=language
+            )
+        )
+        author_addresses = list()
+
+        # Addresses linked to this author
+        try:
+            address_elements = author_element.find_elements(
+                By.XPATH,
+                './/a[contains(@class,"address_link")]'
+            )
+        except NoSuchElementException:
+            address_elements = [author_order]
+
+        for address_element in address_elements:
+            # Get the address number; markers like "[1]" need unwrapping
+            try:
+                address_order = address_element[0]
+            except TypeError:
+                address_order = address_element.text.strip()
+                address_order = str(address_order).split("[")[1].split("]")[0]
+            # Resolve the address through its number
+            try:
+                address = driver.find_element(
+                    By.XPATH,
+                    '//*[@id="address_{}"]/span[2]'.format(address_order)
+                ).text
+                author_addresses.append(address)
+            except:
+                author_addresses.append("No address information listed for this record!")
+        # Email
+        try:
+            author_email = driver.find_element(
+                By.XPATH,
+                '//a[@id="FRAiinTa-AuthRepEmailAddr-{}"]'.format(author_order)
+            ).text
+        except:
+            author_email = "Note: the email addresses of this record do not map one-to-one onto the authors!"
+        author_order += 1
+        author_info = dict(
+            author_order=author_order,
+            author_name_dis=author_name_dis,
+            author_name_std=author_name_std,
+            author_addresses=author_addresses,
+            author_email=author_email,
+        )
+        authors_info.append(author_info)
+    return authors_info
+
+
+def json_to_excel(file_path):
+    """
+    Convert the scraped JSON data to an Excel file.
+    """
+    # Load the JSON-lines file and parse each line into a Python structure
+    with open(os.path.join(file_path, 'search_results_information_got.json'), 'r', encoding='utf-8') as file:
+        rows = []
+        for line in file:
+            data = json.loads(line)
+            rows.append(data)
+
+    # Flatten the Python structures into a DataFrame
+    df = pd.json_normalize(rows)
+
+    # Save the DataFrame as an Excel file
+    df.to_excel(os.path.join(file_path, 'search_results_information_got.xlsx'), index=False)
+
+
+def combine_excel(path):
+    data_list = []
+    # common.list_all_files walks the folder tree and returns the file paths
+    for file in common.list_all_files(path):
+        # Only pick the per-task result workbooks
+        if file.endswith("search_results_information_got.xlsx"):
+            # pd.read_excel returns a DataFrame; collect them in a list
+            data_list.append(pd.read_excel(file))
+        else:
             continue
-        # Download the outbound
-        if not download_outbound(driver, default_download_path):
-            continue
-        # Deal with the outbound
-        if not process_outbound(driver, default_download_path, path):
-            continue
+    # Concatenate all DataFrames
+    data_all = pd.concat(data_list)
+    # Save the combined DataFrame as an Excel file
+    data_all.to_excel("all_results.xlsx", index=False)
 
-        # Deal with records
-        if not process_records(driver, path):
-            continue
 
-        # Search completed
-        if not path == None:
-            mark_flag(path)
-
-    driver.quit()
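+
+# Illustration of the json_normalize step above (made-up record with the same
+# nesting as subpage_inf): nested dicts become dot-separated columns.
+#   pd.json_normalize([{"doi": "10.1/x", "title": {"title_en": "A"}}])
+#   -> columns: ["doi", "title.title_en"]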
diff --git a/main.py b/main.py
index c057073..abee9a8 100644
--- a/main.py
+++ b/main.py
@@ -1,20 +1,106 @@
 # encoding: utf-8
-from crawl import *
-from selenium import webdriver
-
-if __name__ == '__main__':
-    ################ Set up parameters here #####################
-    default_download_path = "C://Users/bigwh/Downloads" + "/savedrecs.txt"
-    # The first string should be the path where your file is downloaded to by default.
-    # Most likely, it should be like: "C://Users/usr_nm/Downloads"
-    task_list = [ # folder_name, query
-        ["results/search_1", "TI=(pFind) AND PY=(2016-2022)"],
-        ["results/search_2", "TI=(Attention is All you Need)"]
-    ]
-    # These are the tasks to be searched
-    driver = webdriver.Chrome(
-        executable_path='C://Program Files//Google//Chrome//Application//chromedriver.exe'
-        # This is the path where you place your chromedriver
-    )
-    #############################################################
-    start_session(driver, task_list, default_download_path)
+import common
+import crawl
+import pandas as pd
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+import tqdm
+import logging
+import os
+import time
+
+################ Set up parameters here #####################
+# Default browser download path.
+# The first string should be the path where your file is downloaded to by default.
+# Most likely, it should be like: "C://Users/usr_nm/Downloads"
+default_download_path = "C:/Users/yunruxian/Downloads" + "/savedrecs.txt"
+
+# Path of the Excel file that holds the search tasks
+task_path = "raw data file/teachers‘list(20240106).xls"
+
+# Read the search task file
+df = pd.read_excel(
+    task_path,
+    sheet_name="Sheet1",
+    header=0,
+    keep_default_na=False,
+)
+
+# Build the list of search tasks
+task_list = []
+for i in range(0, len(df)):
+    school_id = df.iloc[i]["学校ID"]
+    school = df.iloc[i]["学校"]
+    teacher_id = df.iloc[i]["教师ID"]
+    teacher_name_cn = df.iloc[i]["姓名"]
+    teacher_name_en = df.iloc[i]["name"]
+    address_en = df.iloc[i]["address"]
+    address_en_plus = df.iloc[i]["address_plus"]
+    others_information = {"学校ID": school_id, "教师ID": teacher_id, "姓名": teacher_name_cn}
+    task_list.append([f"{school}", F"AU='{teacher_name_en}' AND AD='{address_en}'", others_information])
+    if address_en_plus:
+        task_list.append([f"{school}", F"AU='{teacher_name_en}' AND AD='{address_en_plus}'", others_information])
+
+"""
+Start the search of all tasks.
+task_list: the list of save folders, advanced query strings and extra info
+default_download_path: the default path set for the system, for example, C://Downloads/
+"""
+
+# Init
+"""Configure logging"""
+os.makedirs('logs', exist_ok=True)
+logging.basicConfig(level=logging.INFO,
+                    filename=os.getcwd() + '/logs/log' + time.strftime('%Y%m%d%H%M',
+                                                                       time.localtime(time.time())) + '.log',
+                    filemode="w",
+                    format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
+                    )
+
+if __name__ == '__main__':
+    threader = ThreadPoolExecutor(max_workers=5)
+    remaining_tasks = []
+    # Start Query
+    # tqdm.tqdm(task_list) wraps the iterable in a progress bar
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        # Single-threaded crawl command:
+        # crawl.start(school_folder, query_task, default_download_path, others_information)
+        # Multithreaded command:
+        future = threader.submit(crawl.start, school_folder, query_task, default_download_path, others_information)
+        remaining_tasks.append(future)
+
+    # Poll the remaining futures; iterate over a copy, since removing items
+    # from the list being iterated would skip elements
+    while True:
+        for future in remaining_tasks[:]:
+            if future.done():
+                remaining_tasks.remove(future)
+        # Use the real task count as the total (was a hard-coded 4118)
+        common.show_progress(len(task_list), len(task_list) - len(remaining_tasks))
+        # Exit the loop once every task is finished
+        if len(remaining_tasks) == 0:
+            break
+
+    # Wait for all worker threads to finish.
+    # concurrent.futures.wait takes a list; by this point the list is already
+    # empty, so the call below would be redundant:
+    # concurrent.futures.wait(remaining_tasks)
+    threader.shutdown()
+
+    # Check that everything completed
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        path_school = os.path.join("downloads", school_folder)
+        # Use the read-only check here; calling mark_task_finish_flag would
+        # wrongly flag unfinished tasks as done
+        if not common.Check.check_task_finish_flag(path_school, query_task):
+            print(school_folder, query_task)
+
+    # Transform the json files to excel
+    for school_folder, query_task, others_information in tqdm.tqdm(task_list):
+        # os.listdir returns bare file names, so compare against the bare name
+        if 'search_results_information_got.xlsx' not in os.listdir(
+            os.path.join("downloads", school_folder, query_task)
+        ):
+            try:
+                crawl.json_to_excel(os.path.join("downloads", school_folder, query_task))
+            except:
+                continue
+
+    crawl.combine_excel("downloads")
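+
+# Alternative polling sketch (an assumption, not the project's code): let
+# concurrent.futures.as_completed drive the progress bar instead of the
+# busy loop above.
+#   done_count = 0
+#   for future in concurrent.futures.as_completed(remaining_tasks):
+#       done_count += 1
+#       common.show_progress(len(task_list), done_count)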
diff --git a/raw data file/example.xlsx b/raw data file/example.xlsx
new file mode 100644
index 0000000..84172c0
Binary files /dev/null and b/raw data file/example.xlsx differ
diff --git a/readme.md b/readme.md
index 16483bc..7f1d3f7 100644
--- a/readme.md
+++ b/readme.md
@@ -1,26 +1,13 @@
 # wos-selenium
-Web of Science spider implemented with selenium.
-
-This project mimic the click of mouse to query WoS, export plain text file and download results automatically. Since the new WoS does not support requests and posts easily with the help from Pendo.io (nice company though), old platforms using scrapy may no longer be used anymore.
-
-You can download the code, set up the config in `main.py` and run `main.py` to test the script. You can also follow the code and descriptions in `demo.ipynb`. You should install selenium and set up chromedriver (or firefox driver, etc.) before running the code.
-
-The logic of this project is: For each task, insert the query and do advanced search; then export the plain text file and move it to the destination path; finally open all results in new windows and download them to the destination path.
-
-This project supports English and Simplified Chinese for the time. For other languages, please change the function in `crawl.py/switch_language_to_Eng` with the help of development tools on your browser. You are welcome to fork this project and do improvements on it.
-
-jinyangl
-
-2022.2
-
----
-
-
-针对新版WoS的爬虫,通过模拟浏览器鼠标点击进行WoS的批量化查询、下载、处理。
-
-使用方法:修改`main.py`中的参数,运行文件。也可以跟随`demo.ipynb`的介绍,以更详细地了解爬虫的代码(~~解决我还没解决的bug~~)。运行前确保selenium和chromedriver(或者firefox driver等)已经安装完毕。
-
-代码的逻辑:对于每个任务,首先模拟输入query进行高级检索,然后下载纯文本文件,移动到path中,再打开所有的结果页面,下载到path中。
-
-至今为止这大概是第一个针对新版WoS的爬虫,希望可以抛砖引玉。代码逻辑应该比较清晰,如果有其他需求可以自行修改,也欢迎与我交流(~~挖坑~~)。
+## Function
+Scrape Web of Science (WOS) records with Python + Selenium.
+## Base
+Based on [jinyangl312/wos-selenium](https://github.com/jinyangl312/wos-selenium).
+## New additions
+Adds extraction of a number of important fields on top of the original project.
+## Some tips:
+I use a campus network, so no login is required.
+I am a hobbyist who needed this, so I adapted someone else's code.
+Under multithreading, the feature that downloads the citation export (txt) from the WOS system and moves it into the task folder is broken; I have not been able to fix it yet.
+Questions and discussion are welcome; please reach out by email.
\ No newline at end of file