From 156f36ece231b4d0a6e8b354fabd9f8f5124f14a Mon Sep 17 00:00:00 2001
From: Gbanyan <gbanyan.huang@gmail.com>
Date: Sat, 11 Aug 2018 01:14:30 +0800
Subject: [PATCH] Finish fetching topic content HTML via the UpToDate API

---
 Main.py | 50 +++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/Main.py b/Main.py
index f9d86e0..7eb287f 100644
--- a/Main.py
+++ b/Main.py
@@ -1,41 +1,45 @@
 import scrapy
 from bs4 import BeautifulSoup
 import requests
-from selenium import webdriver
 import html5lib
 import re
 import sys
+import json
 
 #Def the Uptodate URI for use
 up_search_url = "https://www.uptodate.com/contents/search?search="
 up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
 up_prefix_url = "https://www.uptodate.com"
+up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"  # base URL that uptodate_full_content_get() prepends to a topic title
+up_result_title_list = []  # filled by the __main__ block with processed search-result titles
+up_result_url_list = []  # declared for result URLs; nothing in this revision appends to it
 
-def do_uptodate_search_with_gecko(key_word):
-    print(up_search_url + key_word)
-    driver = webdriver.Firefox()
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.find_all(".search-results")  #Still unable to catch the wanted result
-    for links in articles_links:
-        print(links)
-
-def do_uptodate_search_with_headless(key_word):
-    print(up_search_url + key_word)
-    option = webdriver.ChromeOptions()
-    option.add_argument('headless')
-    driver = webdriver.Chrome(chrome_options=option)
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.select("#search-results-container")
-    print(articles_links)
 
 def do_uptodate_search_with_uptodate_api(key_word):
     search_results = requests.get(up_api_url + key_word)
-    print(search_results.json())
+    return(search_results.json())  # hand the decoded JSON body back to the caller instead of printing it
+
+def uptodate_full_content_get(processed_title):
+    """Fetch the /print/json resource for *processed_title* and return its decoded JSON."""
+    return requests.get(up_content_prefix_url + processed_title + "/print/json").json()
+
+def uptodate_title_process(title):
+    """Return *title* with spaces replaced by hyphens, converted to lower case."""
+    return title.replace(" ", "-").lower()
 
 if __name__ == '__main__':
     key_word = input("Please enter your keyword: ")
-    do_uptodate_search_with_uptodate_api(key_word)
+    up_search_result = do_uptodate_search_with_uptodate_api(key_word)
+    # Keep only "medical" hits, stored as lower-case hyphenated titles.
+    up_result_title_list.extend(
+        uptodate_title_process(result["title"])
+        for result in up_search_result["data"]["searchResults"]
+        if result["type"] == "medical"
+    )
+    print(up_result_title_list)
+    # Exit with a message rather than hit IndexError when nothing matched.
+    if not up_result_title_list:
+        sys.exit("No medical search results for: " + key_word)
+    article_full_content = uptodate_full_content_get(up_result_title_list[0])
+    print(article_full_content["data"]["printHtml"])
+