From 156f36ece231b4d0a6e8b354fabd9f8f5124f14a Mon Sep 17 00:00:00 2001
From: Gbanyan
Date: Sat, 11 Aug 2018 01:14:30 +0800
Subject: [PATCH] With uptodate API, finish the process of getting content html

---
 Main.py | 50 +++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/Main.py b/Main.py
index f9d86e0..7eb287f 100644
--- a/Main.py
+++ b/Main.py
@@ -1,41 +1,45 @@
 import scrapy
 from bs4 import BeautifulSoup
 import requests
-from selenium import webdriver
 import html5lib
 import re
 import sys
+import json
 
 #Def the Uptodate URI for use
 up_search_url = "https://www.uptodate.com/contents/search?search="
 up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
 up_prefix_url = "https://www.uptodate.com"
+up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"
+up_result_title_list = []
+up_result_url_list = []
 
-def do_uptodate_search_with_gecko(key_word):
-    print(up_search_url + key_word)
-    driver = webdriver.Firefox()
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.find_all(".search-results") #Still unable to catch the wanted result
-    for links in articles_links:
-        print(links)
-
-def do_uptodate_search_with_headless(key_word):
-    print(up_search_url + key_word)
-    option = webdriver.ChromeOptions()
-    option.add_argument('headless')
-    driver = webdriver.Chrome(chrome_options=option)
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.select("#search-results-container")
-    print(articles_links)
 
 def do_uptodate_search_with_uptodate_api(key_word):
     search_results = requests.get(up_api_url + key_word)
-    print(search_results.json())
+    return(search_results.json())
+
+def uptodate_full_content_get(processed_title):
+    full_content = requests.get(up_content_prefix_url + processed_title + "/print/json")
+    return(full_content.json())
+
+def uptodate_title_process(title):
+    hyphen_title = title.replace(" ", "-")
+    return (hyphen_title.lower())
 
 if __name__ == '__main__':
     key_word = input("Please enter your keyword: ")
-    do_uptodate_search_with_uptodate_api(key_word)
+    up_search_result = do_uptodate_search_with_uptodate_api(key_word)
+    for searchResults in up_search_result["data"]["searchResults"]:
+        if searchResults["type"] == "medical":
+            up_result_title_list.append(searchResults["title"])
+            #up_result_url_list.append(searchResults["url"])
+
+    for index, element in enumerate(up_result_title_list):
+        up_result_title_list[index] = uptodate_title_process(element)
+
+    print(up_result_title_list)
+
+    article_full_content = (uptodate_full_content_get(up_result_title_list[0]))
+    print(article_full_content["data"]["printHtml"])
+