文献批量下载器PyCNKi使用教程
佐佑思维PyCNKi下载器使用教程
PyCNKi下载器源码
(百度链接里有.ipynb格式源码)
一、导入库
import re
import time
import urllib.error

import openpyxl
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
二、打开知网并进行初始设置
# Headless operation (no visible browser window).
def wu_visual():
    """Return Chrome options that request headless mode with the GPU disabled."""
    headless_opts = Options()
    for flag in ('--headless', '--disable-gpu'):
        headless_opts.add_argument(flag)
    return headless_opts
def fan_jiance():
    """Return Chrome options with the 'enable-automation' switch excluded."""
    stealth_opts = ChromeOptions()
    stealth_opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Kiosk mode ('-kiosk') is deliberately left disabled.
    return stealth_opts


def url_error_test(url, bro):
    """Load *url* through ``bro.get`` and report urllib errors on stdout.

    Prints "OK" when the call succeeds. An ``HTTPError`` prints its code and
    reason and the function returns ``None``. Any other ``URLError`` prints
    its reason and returns that reason.

    NOTE(review): only ``urllib.error`` exceptions are handled here; whether
    ``bro.get`` ever raises them depends on the browser object passed in.
    """
    try:
        bro.get(url)
    except urllib.error.HTTPError as http_err:
        # HTTPError is a URLError subclass, so it must be matched first.
        print(http_err.code)
        print(http_err.reason)
    except urllib.error.URLError as url_err:
        print(url_err.reason)
        return url_err.reason
    else:
        print("OK")
# Build the headless options and the anti-automation options.
chrome_options = wu_visual()
option = fan_jiance()

# Passing both ``chrome_options=`` and ``options=`` to webdriver.Chrome does
# not combine them: the deprecated ``chrome_options`` keyword replaces
# ``options`` in Selenium 3 (and no longer exists in Selenium 4), so the
# anti-automation switch was silently lost. Fold the headless arguments into
# ``option`` and pass a single options object instead. The membership test
# keeps the merge idempotent.
for _arg in chrome_options.arguments:
    if _arg not in option.arguments:
        option.add_argument(_arg)

chrome_path = r'./chromedriver.exe'
bro = webdriver.Chrome(executable_path=chrome_path, options=option)

# Firefox users can simply uncomment the next line.
# bro = webdriver.Firefox()

bro.maximize_window()  # maximise the window
url = r'http://kns.cnki.net'  # CNKI home page
bro.get(url)
三、关键词搜索
# Simulate typing a keyword into the search box.
# Pick whichever query mode you need; this script only implements the
# title search.
input_title = bro.find_element(By.ID, "txt_SearchText")
input_title.click()
time.sleep(2)
key_value = input("请输入你要下载的论文标题:")
input_title.send_keys(key_value)

# Click the search button.
div_search = bro.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[1]/input[2]')
div_search.click()
time.sleep(1)

# Switch to the journal-article tab and read the hit count next to it.
default_1 = 20  # page-size threshold: at most this many hits means one page
bro.find_element(By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/span").click()
time.sleep(10)
total_num = bro.find_element(By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/em")

print("一共搜索到" + total_num.text + "条结果")
if int(total_num.text) <= default_1:
    print("共一页")
    # The later page-number validation reads ``num``; define it for the
    # single-page case too so it does not raise NameError.
    num = 1
else:
    # '//*[@id=...]' is the valid form; the original '/[@id=...]' is not a
    # legal XPath expression.
    total_page = bro.find_element(By.XPATH, '//*[@id="gridTable"]/div[2]/span[1]')
    print(total_page.text)
    # Drops the first and last character of the label and parses the rest;
    # presumably the label looks like "共N页" -- TODO confirm against the page.
    num = int(total_page.text[1:-1])
四、选择下载格式及批量下载的截止页码
# Ask which file format to download: 1 = PDF, 2 = CAJ.
print("1:PDF格式2:CAJ格式请输入下载文件的格式对应数字:")
while True:
    try:
        load_num = int(input("请输入1 or 2:"))
    except ValueError:
        # Non-numeric input: treat as invalid and ask again instead of crashing.
        load_num = None
    if load_num in (1, 2):
        break
    # The download step only handles 1 and 2; any other value would open
    # detail pages without downloading anything.
    print("输入错误,请重新输入:")

print("请输入您要下载到第几页码:")
五、开始批量下载
# Batch download: walk the result pages of ``bro`` and fetch every listed
# paper in the format chosen in ``load_num`` (1 = PDF, 2 = CAJ).

_DETAIL_WAIT_SECONDS = 10  # fixed wait after each page load / download click
_RESULT_LINKS_XPATH = '//*[@id="gridTable"]/table/tbody/tr/td[2]/a'
_NEXT_PAGE_XPATH = '//*[@id="PageNext"]'
_DETAIL_TITLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[3]/div[1]/h1"
_DOWNLOAD_BUTTON_IDS = {1: 'pdfDown', 2: 'cajDown'}


def _download_paper(link, index, listed_title):
    """Open *link* in a fresh browser, click the download button, report the result.

    *index* is the running paper number used in the printed messages and
    *listed_title* is the link text from the result list, used when the
    detail page cannot be shown. Returns True when the download button was
    clicked, False otherwise. The browser is always closed, even on errors.
    """
    detail = webdriver.Chrome(executable_path=chrome_path, options=option)
    try:
        detail.get(link)
        detail.maximize_window()
        time.sleep(_DETAIL_WAIT_SECONDS)
        if detail.find_element(By.XPATH, '/html/body/div[2]/div').text == "URL参数错误":
            # The error page has no paper heading, so report the title taken
            # from the result list instead of querying this page for it.
            print("编号为" + str(index) + "的论文:" + listed_title + "————论文下载失败")
            return False
        button_id = _DOWNLOAD_BUTTON_IDS.get(load_num)
        if button_id is None:
            # Unknown format choice: nothing to click.
            return False
        # NOTE(review): the original located the button by id in one branch
        # and by name in the other; id is used throughout here -- confirm the
        # detail page exposes id="pdfDown" / id="cajDown".
        detail.find_element(By.ID, button_id).click()
        time.sleep(_DETAIL_WAIT_SECONDS)
        print("编号为" + str(index) + "的论文:"
              + detail.find_element(By.XPATH, _DETAIL_TITLE_XPATH).text + "————下载成功")
        return True
    finally:
        detail.quit()


load_page = int(input())

# ``num`` is only meaningful when the results span several pages; with at
# most ``default_1`` hits there is exactly one page.
_max_page = num if int(total_num.text) > default_1 else 1
while load_page > _max_page or load_page <= 0:
    print("输入页码错误,请重新输入:")
    # Ask for a page number here (the original reused the "1 or 2" prompt).
    load_page = int(input("请输入您要下载到第几页码:"))

# webdriver.Chrome does not combine ``chrome_options=`` with ``options=``,
# so fold the headless arguments into ``option`` once and pass only that.
for _arg in chrome_options.arguments:
    if _arg not in option.arguments:
        option.add_argument(_arg)

count = 1  # running paper number across all pages
for _page_index in range(load_page):
    # Read href and text up front so the loop does not keep touching page
    # elements while downloads are running.
    _targets = [
        (_a.get_attribute("href"), _a.text)
        for _a in bro.find_elements(By.XPATH, _RESULT_LINKS_XPATH)
    ]
    for _href, _listed_title in _targets:
        if not _href:
            continue
        # Drops the first 20 characters of the href and re-roots the rest on
        # the detail endpoint; presumably that prefix is the part before the
        # query string -- TODO confirm against a live result page.
        link = url + r'/kcms/detail/detail.aspx?' + _href[20:]
        _download_paper(link, count, _listed_title)
        count += 1
    # Only advance when another requested page remains; clicking "next"
    # after the final page is unnecessary.
    if _page_index < load_page - 1:
        bro.find_element(By.XPATH, _NEXT_PAGE_XPATH).click()
        time.sleep(_DETAIL_WAIT_SECONDS)