With the UpToDate API, finish the process of getting the content HTML

Gbanyan 2018-08-11 01:14:30 +08:00
parent 2c327a733d
commit 156f36ece2

Main.py

@@ -1,41 +1,45 @@
import scrapy
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import html5lib
import re
import sys
import json

# Define the UpToDate URLs used below
up_search_url = "https://www.uptodate.com/contents/search?search="
up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
up_prefix_url = "https://www.uptodate.com"
up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"
up_result_title_list = []
up_result_url_list = []

def do_uptodate_search_with_gecko(key_word):
    # Search via a visible Firefox session and scrape the rendered page
    print(up_search_url + key_word)
    driver = webdriver.Firefox()
    driver.get(up_search_url + key_word)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # find_all(".search-results") looked for a tag literally named ".search-results";
    # a CSS class selector is needed here (still unable to catch the wanted result)
    articles_links = soup.select(".search-results")
    for links in articles_links:
        print(links)

def do_uptodate_search_with_headless(key_word):
    print(up_search_url + key_word)
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=option)
    driver.get(up_search_url + key_word)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    articles_links = soup.select("#search-results-container")
    print(articles_links)

def do_uptodate_search_with_uptodate_api(key_word):
    # Query the UpToDate search API directly and return the parsed JSON
    search_results = requests.get(up_api_url + key_word)
    print(search_results.json())
    return search_results.json()

def uptodate_full_content_get(processed_title):
    # Fetch the full topic content (print view) as JSON for a processed title slug
    full_content = requests.get(up_content_prefix_url + processed_title + "/print/json")
    return full_content.json()

def uptodate_title_process(title):
    # Convert a result title into the hyphenated, lower-case slug used in topic URLs
    hyphen_title = title.replace(" ", "-")
    return hyphen_title.lower()

if __name__ == '__main__':
    key_word = input("Please enter your keyword: ")
    up_search_result = do_uptodate_search_with_uptodate_api(key_word)
    # Keep only the medical topic results
    for searchResults in up_search_result["data"]["searchResults"]:
        if searchResults["type"] == "medical":
            up_result_title_list.append(searchResults["title"])
            #up_result_url_list.append(searchResults["url"])
    # Turn each title into the slug the content endpoint expects
    for index, element in enumerate(up_result_title_list):
        up_result_title_list[index] = uptodate_title_process(element)
    print(up_result_title_list)
    article_full_content = uptodate_full_content_get(up_result_title_list[0])
    print(article_full_content["data"]["printHtml"])
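
The __main__ block above only prints the raw printHtml string to stdout. A minimal follow-up sketch for saving that HTML to disk, assuming the same response layout the script already relies on (a "data" object with a "printHtml" field); the function name and default output path are hypothetical, added only for illustration:

# Sketch: persist the fetched topic HTML to a file instead of printing it.
# Assumes the "data" -> "printHtml" layout used above; save_topic_html and
# the default out_path are illustrative names, not part of the commit.
def save_topic_html(processed_title, out_path="topic.html"):
    content = uptodate_full_content_get(processed_title)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(content["data"]["printHtml"])
    return out_path

# e.g. save_topic_html(up_result_title_list[0])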