With the UpToDate API, finish the process of getting the content HTML

This commit is contained in:
Gbanyan 2018-08-11 01:14:30 +08:00
parent 2c327a733d
commit 156f36ece2

50
Main.py
View File

@ -1,41 +1,45 @@
import json
import re
import sys
from urllib.parse import quote

import html5lib
import requests
import scrapy
from bs4 import BeautifulSoup
from selenium import webdriver
# UpToDate URLs used by this script.
# Browser-facing search page; the search keyword is appended to it.
up_search_url = "https://www.uptodate.com/contents/search?search="
# JSON search endpoint (English, at most 10 results); the keyword is appended.
up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
# Site root.
up_prefix_url = "https://www.uptodate.com"
# Topic content endpoint; "<processed-title>/print/json" is appended to it.
up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"

# Module-level accumulators filled in by the script entry point.
up_result_title_list = []
# Only referenced from commented-out code so far.
up_result_url_list = []
def do_uptodate_search_with_gecko(key_word):
    """Load the UpToDate search page in Firefox and print the result nodes.

    Args:
        key_word: Search phrase; it is URL-encoded and appended to
            ``up_search_url``.

    Prints the requested URL, then every element of the page source that
    matches the ``.search-results`` CSS selector. Returns ``None``.
    """
    search_url = up_search_url + quote(key_word, safe="")
    print(search_url)
    driver = webdriver.Firefox()
    try:
        driver.get(search_url)
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # browser/driver process behind.
        driver.quit()
    soup = BeautifulSoup(html, 'html.parser')
    # find_all(".search-results") looks for a *tag* literally named
    # ".search-results" and therefore never matches; CSS selectors have to go
    # through select().
    # NOTE(review): if the results are rendered client-side after the initial
    # load, an explicit wait may still be needed before reading page_source
    # -- confirm.
    for link in soup.select(".search-results"):
        print(link)
def do_uptodate_search_with_headless(key_word):
    """Load the UpToDate search page in headless Chrome and print the results.

    Args:
        key_word: Search phrase; it is URL-encoded and appended to
            ``up_search_url``.

    Prints the requested URL, then the list of elements matching the
    ``#search-results-container`` CSS selector. Returns ``None``.
    """
    search_url = up_search_url + quote(key_word, safe="")
    print(search_url)
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated.
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(search_url)
        html = driver.page_source
    finally:
        # A headless browser has no window to close by hand, so quit it
        # explicitly to avoid leaving the process running.
        driver.quit()
    soup = BeautifulSoup(html, 'html.parser')
    articles_links = soup.select("#search-results-container")
    print(articles_links)
def do_uptodate_search_with_uptodate_api(key_word):
    """Query the UpToDate JSON search endpoint and return the decoded body.

    Args:
        key_word: Search phrase; it is URL-encoded and appended to
            ``up_api_url``.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    # Encode the keyword so characters such as "&" or "#" cannot truncate or
    # alter the query string.
    search_results = requests.get(up_api_url + quote(key_word, safe=""), timeout=30)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    search_results.raise_for_status()
    return search_results.json()
def uptodate_full_content_get(processed_title):
    """Fetch the printable JSON representation of one UpToDate topic.

    Args:
        processed_title: Topic identifier placed in the URL path between
            ``up_content_prefix_url`` and ``/print/json``.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    # Encode the title so characters such as "/", "?" or "#" stay inside the
    # single path segment they belong to.
    content_url = up_content_prefix_url + quote(processed_title, safe="") + "/print/json"
    full_content = requests.get(content_url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    full_content.raise_for_status()
    return full_content.json()
def uptodate_title_process(title):
    """Turn a topic title into a lower-case, hyphen-separated string.

    Leading/trailing whitespace is dropped and every run of whitespace
    becomes a single hyphen, so stray spaces cannot produce leading,
    trailing or doubled hyphens.

    Args:
        title: Topic title text.

    Returns:
        The title with its words joined by "-" and lower-cased.
    """
    return "-".join(title.split()).lower()
if __name__ == '__main__':
    # Script flow: ask for a keyword, search, keep the "medical" results,
    # then print the printable HTML of the first one.
    key_word = input("Please enter your keyword: ")
    up_search_result = do_uptodate_search_with_uptodate_api(key_word)
    # Collect the processed titles in one pass. ``.get`` keeps a result
    # without a "type" key from aborting the whole run.
    up_result_title_list.extend(
        uptodate_title_process(search_result["title"])
        for search_result in up_search_result["data"]["searchResults"]
        if search_result.get("type") == "medical"
    )
    print(up_result_title_list)
    if up_result_title_list:
        article_full_content = uptodate_full_content_get(up_result_title_list[0])
        print(article_full_content["data"]["printHtml"])
    else:
        # Without this guard an empty result list crashed with IndexError.
        print("No medical topics found for: " + key_word)