With the UpToDate API, finish the process of getting the content HTML
This commit is contained in:
parent 2c327a733d
commit 156f36ece2
Main.py (50 changed lines)
@@ -1,41 +1,45 @@
 import scrapy
 from bs4 import BeautifulSoup
 import requests
-from selenium import webdriver
 import html5lib
 import re
 import sys
+import json
 
 #Def the Uptodate URI for use
 up_search_url = "https://www.uptodate.com/contents/search?search="
 up_api_url = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
 up_prefix_url = "https://www.uptodate.com"
+up_content_prefix_url = "https://www.uptodate.com/services/app/contents/topic/"
+
+up_result_title_list = []
+up_result_url_list = []
 
-def do_uptodate_search_with_gecko(key_word):
-    print(up_search_url + key_word)
-    driver = webdriver.Firefox()
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.find_all(".search-results") #Still unable to catch the wanted result
-    for links in articles_links:
-        print(links)
-
-def do_uptodate_search_with_headless(key_word):
-    print(up_search_url + key_word)
-    option = webdriver.ChromeOptions()
-    option.add_argument('headless')
-    driver = webdriver.Chrome(chrome_options=option)
-    driver.get(up_search_url + key_word)
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    articles_links = soup.select("#search-results-container")
-    print(articles_links)
-
 def do_uptodate_search_with_uptodate_api(key_word):
     search_results = requests.get(up_api_url + key_word)
-    print(search_results.json())
+    return(search_results.json())
+
+def uptodate_full_content_get(processed_title):
+    full_content = requests.get(up_content_prefix_url + processed_title + "/print/json")
+    return(full_content.json())
+
+def uptodate_title_process(title):
+    hyphen_title = title.replace(" ", "-")
+    return (hyphen_title.lower())
 
 if __name__ == '__main__':
     key_word = input("Please enter your keyword: ")
-    do_uptodate_search_with_uptodate_api(key_word)
+    up_search_result = do_uptodate_search_with_uptodate_api(key_word)
+    for searchResults in up_search_result["data"]["searchResults"]:
+        if searchResults["type"] == "medical":
+            up_result_title_list.append(searchResults["title"])
+            #up_result_url_list.append(searchResults["url"])
+
+    for index, element in enumerate(up_result_title_list):
+        up_result_title_list[index] = uptodate_title_process(element)
+
+    print(up_result_title_list)
+
+    article_full_content = (uptodate_full_content_get(up_result_title_list[0]))
+    print(article_full_content["data"]["printHtml"])
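End-to-end, the committed flow is: query the JSON search endpoint, keep the "medical" results, hyphenate and lowercase each title into a topic slug, then fetch that topic's print view and read data.printHtml. Below is a minimal standalone sketch of the same flow with URL-encoding of the keyword and basic error handling added. The endpoint URLs and response fields come from the committed code; everything else (the fetch_first_article name, the timeouts, the empty-result handling) is an illustrative assumption, untested against the live service, which may also require authentication.

import requests
from urllib.parse import quote

UP_API_URL = "https://www.uptodate.com/services/app/contents/search/2/json?&language=en&max=10&search="
UP_CONTENT_PREFIX_URL = "https://www.uptodate.com/services/app/contents/topic/"

def fetch_first_article(key_word):
    # Search via the JSON API (mirrors do_uptodate_search_with_uptodate_api),
    # URL-encoding the keyword so spaces and punctuation survive the query string.
    resp = requests.get(UP_API_URL + quote(key_word), timeout=10)
    resp.raise_for_status()  # fail loudly rather than indexing into an error payload
    results = resp.json()["data"]["searchResults"]
    # Keep only "medical" topics and slug their titles (mirrors uptodate_title_process).
    titles = [r["title"].replace(" ", "-").lower()
              for r in results if r["type"] == "medical"]
    if not titles:
        return None  # avoids the IndexError the committed code raises on an empty list
    content = requests.get(UP_CONTENT_PREFIX_URL + titles[0] + "/print/json", timeout=10)
    content.raise_for_status()
    return content.json()["data"]["printHtml"]

if __name__ == '__main__':
    html = fetch_first_article(input("Please enter your keyword: "))
    print(html if html is not None else "No medical topics found.")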
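One caveat in the new uptodate_title_process: a bare space-to-hyphen replacement will not produce valid slugs for titles containing punctuation, and colons, commas, and parentheses are common in UpToDate topic titles. A hedged alternative using the re module the file already imports; the exact slug rules the site applies are an assumption and should be checked against real topic URLs:

import re

def uptodate_title_process(title):
    # Lowercase, drop everything except letters, digits, spaces and hyphens,
    # then collapse runs of whitespace into single hyphens.
    # NOTE: assumed slug scheme; verify against live UpToDate topic URLs.
    cleaned = re.sub(r"[^a-z0-9\s-]", "", title.lower())
    return re.sub(r"\s+", "-", cleaned).strip("-")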