import selenium # imported without a short alias here
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.keys import Keys
service = Service(ChromeDriverManager().install())
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=service, options=options) # earlier attempts that passed an explicit driver path/options did not work (unresolved)
driver.maximize_window() # maximize the browser window
# Use Selenium's By class for locating elements.
from selenium.webdriver.common.by import By
import pyperclip
import time
from bs4 import BeautifulSoup
driver.execute_script('window.open("about:blank", "_blank");')
tabs = driver.window_handles
# Connect to Naver in the first tab.
driver.switch_to.window(tabs[0])
#driver.get('http://www.naver.com')
import requests
search_item = 'RPA 파이썬'
#driver.find_element(By.XPATH, '//*[@id="query"]').send_keys(search_item) # type the search term
#driver.find_element(By.ID, 'search-btn').click() # run the search
#driver.find_element(By.XPATH, '//*[@id="lnb"]/div[1]/div/div[1]/div/div[1]/div[7]').click() # select the News tab
#driver.find_element(By.XPATH, '//*[@id="snb"]/div[1]/div/div[1]/a[2]').click() # sort by latest
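The four commented-out lines above drive the search through the Naver page UI instead of building the search URL directly. If you take that route, the WebDriverWait and Keys imports above can be used to wait for the search box before typing. A minimal sketch, assuming the query-box locator from the comments is still valid (search_on_naver is a hypothetical helper name):

from selenium.webdriver.support import expected_conditions as EC

def search_on_naver(keyword):
    # wait up to 10 seconds for the search box, then type the keyword and submit
    box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="query"]')))
    box.send_keys(keyword)
    box.send_keys(Keys.ENTER)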
tbl_list = []
for i in range(2):
    # Naver news search pages step by 10: start=1, 11, 21, ...
    page_num = i * 10 + 1
    naver_url = 'https://search.naver.com/search.naver?where=news&sm=tab_pge&sort=1&query=' + search_item + "&start=" + str(page_num)
    driver.get(naver_url)
    tbl = driver.find_element(By.CLASS_NAME, 'list_news') # get the news list as an object
    tbl_row = tbl.find_elements(By.CLASS_NAME, 'news_area') # get each news item as an object
    for idx, value in enumerate(tbl_row):
        news_title = value.find_element(By.CLASS_NAME, 'news_tit') # get the news title
        #print(news_title.text)
        news_info = value.find_element(By.CLASS_NAME, 'info_group') # get the press/date info block
        news_press = news_info.find_element(By.TAG_NAME, 'a')
        #print(news_press.text)
        # lstrip() strips a set of characters, not a prefix, so remove the press name explicitly
        news_date = news_info.text.replace(news_press.text, '', 1).strip()
        #print(news_date)
        news_contents = value.find_element(By.CLASS_NAME, 'news_contents') # get the title/link block
        news_link = news_contents.find_elements(By.TAG_NAME, 'a')
        print(news_link[0].get_attribute('href'))
        tbl_list = tbl_list + [[news_title.text, news_press.text, news_date, news_link[0].get_attribute('href')]]
#print(tbl_list)
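The list_news lookup inside the loop assumes the results are already rendered when driver.get() returns. Since WebDriverWait and TimeoutException are imported above but never used, here is a minimal sketch of a more defensive lookup built on the same class names (get_news_rows is a hypothetical helper name):

from selenium.webdriver.support import expected_conditions as EC

def get_news_rows(timeout=10):
    # wait for the results list instead of assuming it is already there
    try:
        tbl = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'list_news')))
    except TimeoutException:
        return []  # no results, or the page structure changed
    return tbl.find_elements(By.CLASS_NAME, 'news_area')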
df_news = pd.DataFrame.from_records(tbl_list, columns=['뉴스제목','매체','등록일','원문주소'])
# Exclude near-duplicate news (left commented out)
#from fuzzywuzzy import fuzz, process
#df_news['Similarity'] = None
#for i in range(len(df_news)):
#    if i < len(df_news) - 1:
#        df_news.loc[i, 'Similarity'] = fuzz.ratio(df_news.iloc[i]['뉴스제목'], df_news.iloc[i+1]['뉴스제목'])
#    else:
#        df_news.loc[i, 'Similarity'] = 0
#df_news['Similarity_Shift'] = df_news['Similarity'].shift(1)
#df_news['Similarity_Shift'] = df_news['Similarity_Shift'].fillna(0)
#df_news['Similarity_Min'] = df_news.apply(lambda x: min(x['Similarity'], x['Similarity_Shift']), axis=1)
df_news_final = df_news[:]
# If a row has an empty date, carry the previous row's date down
for i in range(len(df_news_final)):
    if df_news_final.loc[i, '등록일'] == "":
        df_news_final.loc[i, '등록일'] = df_news_final.loc[i-1, '등록일']
#for i in range(len(df_news_final))[::-1]:
#    if df_news_final.loc[i, 'Similarity_Min'] >= 40:
#        df_news_final = df_news_final.drop(i)
#df_news_final = df_news_final.reset_index()
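The commented-out blocks above sketch a way to drop near-duplicate headlines by scoring consecutive titles with fuzzywuzzy. A compact version of the same idea, kept separate so the main flow stays unchanged (assumes fuzzywuzzy is installed; the 40 threshold mirrors the commented code; drop_similar_titles is a hypothetical helper name):

from fuzzywuzzy import fuzz

def drop_similar_titles(df, threshold=40):
    # keep a row only when its title is not too similar to the last kept title
    kept_rows = []
    last_title = None
    for _, row in df.iterrows():
        if last_title is None or fuzz.ratio(row['뉴스제목'], last_title) < threshold:
            kept_rows.append(row)
            last_title = row['뉴스제목']
    return pd.DataFrame(kept_rows).reset_index(drop=True)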
news_html = ""
for i in range(len(df_news_final)) :
news_html = news_html + "<div><p><a href ='\
" + str(df_news_final.loc[i, '원문주소']) + "'>\
" + str(df_news_final.loc[i, '뉴스제목']) + "</a>\
" + str(df_news_final.loc[i, '매체']) + " \
" + str(df_news_final.loc[i, '등록일']) + "</p></div>"
print(news_html)
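Printing the snippet is enough for a quick check, but if you want to look at the result in a browser, it can also be written out to a file (news_list.html is a hypothetical file name):

# write the generated snippet to an HTML file for a quick visual check
with open('news_list.html', 'w', encoding='utf-8') as f:
    f.write("<html><body>" + news_html + "</body></html>")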