From ca29036296e07bae3fbac810337f87b54d6aed45 Mon Sep 17 00:00:00 2001
From: Gbanyan <gbanyan.huang@gmail.com>
Date: Sun, 12 Aug 2018 01:04:25 +0800
Subject: [PATCH] Complete Process to get Uptodate content and store as HTML

---
 Main.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/Main.py b/Main.py
index 7eb287f..1dfb94e 100644
--- a/Main.py
+++ b/Main.py
@@ -1,16 +1,14 @@
 import scrapy
 from bs4 import BeautifulSoup
 import requests
-import html5lib
 import re
 import sys
 import json
 
 #Def the Uptodate URI for use
-up_search_url = "https://www.uptodate.com/contents/search?search="
 up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
 up_prefix_url = "https://www.uptodate.com"
-up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"
+up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/" #the content prefix URL
 up_result_title_list = []
 up_result_url_list = []
 
@@ -19,27 +17,49 @@ def do_uptodate_search_with_uptodate_api(key_word):
     search_results = requests.get(up_api_url + key_word)
     return(search_results.json())
 
+def uptodate_title_process(title):  # observed: the content API needs the title, converted to a slug-like form
+    slug = title.replace(" ", "-")  # spaces become hyphens
+    slug = slug.replace(":", "").lower()  # colons are dropped, then the result is lower-cased
+    return slug
+
 def uptodate_full_content_get(processed_title):
     full_content = requests.get(up_content_prefix_url + processed_title + "/print/json")
     return(full_content.json())
 
-def uptodate_title_process(title):
-    hyphen_title = title.replace(" ", "-")
-    return (hyphen_title.lower())
+def uptodate_title_filter(title):
+    """Return True if *title* contains "approach" or "evaluation".
+
+    The match is a case-insensitive substring test, so "Approach to ...",
+    "... evaluation" and "EVALUATION OF ..." are all accepted; a title
+    that contains neither keyword yields False.
+    """
+    keywords = ("approach", "evaluation")
+    lowered_title = title.lower()
+    # any() yields a real bool and stops at the first keyword that is found.
+    return any(keyword in lowered_title for keyword in keywords)
+
+def ouput_html(item):
+    """Fetch the topic for processed title *item* and write its "topicContent" element to <item>.html."""
+    print_html = uptodate_full_content_get(item)["data"]["printHtml"]
+    pretty_soup = BeautifulSoup(BeautifulSoup(print_html, 'html5lib').prettify(), 'html5lib')
+    content = str(pretty_soup.find(id="topicContent"))
+    with open(item + ".html", "w", encoding="utf-8") as file:  # explicit UTF-8: the platform default may not encode the HTML
+        file.write(content)
 
 if __name__ == '__main__':
     key_word = input("Please enter your keyword: ")
     up_search_result = do_uptodate_search_with_uptodate_api(key_word)
     for searchResults in up_search_result["data"]["searchResults"]:
         if searchResults["type"] == "medical":
-            up_result_title_list.append(searchResults["title"])
-            #up_result_url_list.append(searchResults["url"])
+            if uptodate_title_filter(searchResults["title"]):  # keep only titles the keyword filter accepts
+                up_result_title_list.append(searchResults["title"])
 
     for index, element in enumerate(up_result_title_list):
         up_result_title_list[index] = uptodate_title_process(element)
 
     print(up_result_title_list)
 
-    article_full_content = (uptodate_full_content_get(up_result_title_list[0]))
-    print(article_full_content["data"]["printHtml"])
+    for item in up_result_title_list:  # one <processed-title>.html file per selected topic
+        ouput_html(item)
+