From ca29036296e07bae3fbac810337f87b54d6aed45 Mon Sep 17 00:00:00 2001 From: Gbanyan Date: Sun, 12 Aug 2018 01:04:25 +0800 Subject: [PATCH] Complete Process to get Uptodate content and store as HTML --- Main.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/Main.py b/Main.py index 7eb287f..1dfb94e 100644 --- a/Main.py +++ b/Main.py @@ -1,16 +1,14 @@ import scrapy from bs4 import BeautifulSoup import requests -import html5lib import re import sys import json #Def the Uptodate URI for use -up_search_url = "https://www.uptodate.com/contents/search?search=" up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search=" up_prefix_url = "https://www.uptodate.com" -up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/" +up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/" #the content prefix URL up_result_title_list = [] up_result_url_list = [] @@ -19,27 +17,49 @@ def do_uptodate_search_with_uptodate_api(key_word): search_results = requests.get(up_api_url + key_word) return(search_results.json()) +def uptodate_title_process(title): #經觀察後,串接到內容 API 需要標題,但是標題格式需要做轉換 + hyphen_title = title.replace(" ", "-") #把空格取代為hyphen + hyphen_title = hyphen_title.replace(":", "") #把冒號去掉 + return (hyphen_title.lower()) + def uptodate_full_content_get(processed_title): full_content = requests.get(up_content_prefix_url + processed_title + "/print/json") return(full_content.json()) -def uptodate_title_process(title): - hyphen_title = title.replace(" ", "-") - return (hyphen_title.lower()) +def uptodate_title_filter(title): + search_list = ["Approach", "approach", "evaluation", "Evaluation"] + hitlist = [] + for element in search_list: + if (title).find(element) == -1: + hitlist.append(0) + else: + hitlist.append(1) + + if 1 in hitlist: return True + else: return False + +def ouput_html(item): + article_full_content = (uptodate_full_content_get(item)) + soup = BeautifulSoup(article_full_content["data"]["printHtml"], 'html5lib') + prettyHTML = BeautifulSoup(soup.prettify(), 'html5lib') + content = str(prettyHTML.find(id="topicContent")) + with open(item + ".html", "w") as file: + file.write(content) if __name__ == '__main__': key_word = input("Please enter your keyword: ") up_search_result = do_uptodate_search_with_uptodate_api(key_word) for searchResults in up_search_result["data"]["searchResults"]: if searchResults["type"] == "medical": - up_result_title_list.append(searchResults["title"]) - #up_result_url_list.append(searchResults["url"]) + if uptodate_title_filter(searchResults["title"]) == True: + up_result_title_list.append(searchResults["title"]) for index, element in enumerate(up_result_title_list): up_result_title_list[index] = uptodate_title_process(element) print(up_result_title_list) - article_full_content = (uptodate_full_content_get(up_result_title_list[0])) - print(article_full_content["data"]["printHtml"]) + for item in up_result_title_list: + ouput_html(item) +