2018-08-10 18:13:15 +08:00
|
|
|
import scrapy
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import requests
|
|
|
|
import re
|
|
|
|
import sys
|
2018-08-11 01:14:30 +08:00
|
|
|
import json
|
2018-08-10 18:13:15 +08:00
|
|
|
|
2018-08-10 21:11:35 +08:00
|
|
|
# Define the UpToDate URLs used by this script
|
|
|
|
# Search endpoint; the query string carries language=en and max=10, and the
# search keyword is appended directly after the trailing "search=".
up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
|
|
|
|
# Site root of UpToDate. NOTE(review): not referenced by the code visible in
# this section -- confirm whether it is still needed.
up_prefix_url = "https://www.uptodate.com"
|
2018-08-12 01:04:25 +08:00
|
|
|
# Prefix of the per-topic content endpoint; uptodate_full_content_get()
# appends the processed title followed by "/print/json".
up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"  # content prefix URL
|
2018-08-11 01:14:30 +08:00
|
|
|
# Filled by the __main__ section: first with the titles of the search results
# that pass the filter, then rewritten in place with their processed forms.
up_result_title_list = []
|
|
|
|
# NOTE(review): not referenced by the code visible in this section --
# presumably reserved for result URLs; confirm before removing.
up_result_url_list = []
|
2018-08-10 18:13:15 +08:00
|
|
|
|
|
|
|
|
2018-08-10 21:11:35 +08:00
|
|
|
def do_uptodate_search_with_uptodate_api(key_word, timeout=30):
    """Query the UpToDate search API for *key_word* and return the decoded JSON.

    Args:
        key_word: Raw (not yet percent-encoded) search term. It is
            percent-encoded here before being appended to ``up_api_url`` so
            that characters such as ``&``, ``#``, ``+`` or spaces cannot
            corrupt the query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response status.
        ValueError: if the response body is not valid JSON.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    search_results = requests.get(
        up_api_url + quote(key_word, safe=""),
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    search_results.raise_for_status()
    return search_results.json()
|
|
|
|
|
2018-08-12 01:04:25 +08:00
|
|
|
def uptodate_title_process(title):
    """Convert a result title into the form used to address the content API.

    By observation, the content API is addressed by title, but the title has
    to be reshaped first: every space becomes a hyphen, every colon is
    dropped, and the result is lower-cased.
    """
    # One pass over the string: space -> hyphen, colon -> removed.
    slug_table = str.maketrans({" ": "-", ":": None})
    return title.translate(slug_table).lower()
|
|
|
|
|
2018-08-11 01:14:30 +08:00
|
|
|
def uptodate_full_content_get(processed_title, timeout=30):
    """Fetch the printable JSON document of one topic.

    Args:
        processed_title: Title already converted to the hyphenated,
            lower-case form. It is percent-encoded before being placed in
            the URL path so that characters such as ``?``, ``#`` or ``/``
            cannot change the structure of the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response status.
        ValueError: if the response body is not valid JSON.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    url = up_content_prefix_url + quote(processed_title, safe="") + "/print/json"
    full_content = requests.get(url, timeout=timeout)
    # Surface HTTP failures (e.g. unknown topic) as an HTTP error rather than
    # as a JSON decode error on an error page.
    full_content.raise_for_status()
    return full_content.json()
|
|
|
|
|
2018-08-12 01:04:25 +08:00
|
|
|
def uptodate_title_filter(title, keywords=("approach", "evaluation")):
    """Return True when *title* contains any of *keywords*, ignoring case.

    The comparison is case-insensitive, so "Approach", "approach" and
    "APPROACH" are all accepted without listing every spelling.

    Args:
        title: The title text to inspect.
        keywords: Substrings to look for. Defaults to the two words the
            script filters on.

    Returns:
        True if at least one keyword occurs in the title, otherwise False.
    """
    folded_title = title.casefold()
    return any(keyword.casefold() in folded_title for keyword in keywords)
|
|
|
|
|
|
|
|
def ouput_html(item):
    """Download one topic and save its content as ``test/<item>.html``.

    The topic is fetched with ``uptodate_full_content_get``, the markup under
    ``["data"]["printHtml"]`` is parsed and prettified, and the element with
    id ``topicContent`` is written to disk as UTF-8.

    Args:
        item: Processed title of the topic; also used as the file name.
            NOTE(review): a value containing a path separator would change
            the output location -- confirm titles cannot contain one.

    Raises:
        KeyError: if the response lacks ``data`` / ``printHtml``.
        OSError: if the output file cannot be created or written.
    """
    # Function-scope import keeps this block self-contained.
    import os

    article_full_content = uptodate_full_content_get(item)
    soup = BeautifulSoup(article_full_content["data"]["printHtml"], 'html5lib')
    # The prettified markup is parsed again before the lookup, presumably so
    # that the saved fragment keeps the prettified layout.
    prettyHTML = BeautifulSoup(soup.prettify(), 'html5lib')

    topic = prettyHTML.find(id="topicContent")
    # find() returns None when the element is absent; str(None) would write
    # the literal text "None", so fall back to the whole document instead.
    content = str(topic if topic is not None else prettyHTML)

    # Create the output directory on first use instead of failing in open().
    os.makedirs("test", exist_ok=True)
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent every character in the article.
    with open("test/" + item + ".html", "w", encoding="utf-8") as out_file:
        out_file.write(content)
|
2018-08-10 18:13:15 +08:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ask for a search term and run the search.
    key_word = input("Please enter your keyword: ")
    up_search_result = do_uptodate_search_with_uptodate_api(key_word)

    # Collect the titles of "medical" results that pass the title filter.
    for hit in up_search_result["data"]["searchResults"]:
        if hit["type"] != "medical":
            continue
        if uptodate_title_filter(hit["title"]):
            up_result_title_list.append(hit["title"])

    # Replace every collected title, in place, with its processed form.
    up_result_title_list[:] = [
        uptodate_title_process(raw_title) for raw_title in up_result_title_list
    ]

    print(up_result_title_list)

    # Download and save each selected topic.
    for slug in up_result_title_list:
        ouput_html(slug)
|
|
|
|
|
2018-08-11 01:14:30 +08:00
|
|
|
|