
(python) Naver News Crawling

야곰야곰+책벌레 2023. 11. 27. 17:13
import selenium # imported here without an alias (unlike pandas below)
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.keys import Keys

service = Service(ChromeDriverManager().install())
options = Options()
options.add_argument("--start-maximized")

driver = webdriver.Chrome(service=service, options=options) # note: passing an explicit driver path or extra options did not work for me (unresolved)
driver.maximize_window() # maximize the browser window

# use Selenium's By class to locate elements.
from selenium.webdriver.common.by import By

import pyperclip
import time
from bs4 import BeautifulSoup

driver.execute_script('window.open("about:blank", "_blank");')
tabs = driver.window_handles
# access Naver from the first tab.
driver.switch_to.window(tabs[0])
#driver.get('http://www.naver.com')

import requests

search_item = 'RPA 파이썬'
#driver.find_element(By.XPATH, '//*[@id="query"]').send_keys(search_item) # enter the search term
#driver.find_element(By.ID, 'search-btn').click() # click the search button
#driver.find_element(By.XPATH, '//*[@id="lnb"]/div[1]/div/div[1]/div/div[1]/div[7]').click() # select the News tab
#driver.find_element(By.XPATH, '//*[@id="snb"]/div[1]/div/div[1]/a[2]').click() # sort by most recent
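# (Sketch) If the UI navigation above is used instead of loading the search URL directly,
# the already-imported WebDriverWait/TimeoutException can guard each step. The locators are
# copied from the commented lines above and are assumptions that may break when Naver
# changes its markup; expected_conditions would also need to be imported.
#from selenium.webdriver.support import expected_conditions as EC
#try:
#    search_box = WebDriverWait(driver, 10).until(
#        EC.presence_of_element_located((By.XPATH, '//*[@id="query"]')))
#    search_box.send_keys(search_item)
#    search_box.send_keys(Keys.ENTER)
#except TimeoutException:
#    print('search box did not appear within 10 seconds')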

tbl_list = []
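# (Sketch) search_item contains a space and Korean characters. driver.get() in the loop
# below tolerates the raw string, but the query could be URL-encoded explicitly with the
# standard library; shown here only as an optional safeguard, not used further down.
#from urllib.parse import quote_plus
#search_item_encoded = quote_plus(search_item)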

for i in range(2) :
    # Naver news search paginates with start=1, 11, 21, ... (10 results per page)
    page_num = i * 10 + 1
    
    naver_url = 'https://search.naver.com/search.naver?where=news&sm=tab_pge&sort=1&query=' + search_item + "&start=" + str(page_num)
    driver.get(naver_url)
    
    tbl = driver.find_element(By.CLASS_NAME, 'list_news') # grab the news list container
    tbl_row = tbl.find_elements(By.CLASS_NAME, 'news_area') # grab each news item in the list
          
    for idx, value in enumerate(tbl_row) :
        news_title = value.find_element(By.CLASS_NAME, 'news_tit') # news headline
        #print(news_title.text)
        
        news_info = value.find_element(By.CLASS_NAME, 'info_group') # press name and date block
        news_press = news_info.find_element(By.TAG_NAME, 'a')
        #print(news_press.text)
        # lstrip() strips a set of characters rather than a prefix string, so it can eat
        # parts of the date; remove the press name as a prefix instead.
        news_date = news_info.text.replace(news_press.text, '', 1).strip()
        #print(news_date)
        
        news_contents = value.find_element(By.CLASS_NAME, 'news_contents') # contents block holding the article link
        news_link = news_contents.find_elements(By.TAG_NAME, 'a')
        print(news_link[0].get_attribute('href'))
        
        tbl_list = tbl_list + [[news_title.text, news_press.text, news_date, news_link[0].get_attribute('href')]]
        
#print(tbl_list)
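# (Sketch) requests and BeautifulSoup are imported above but not used. In principle the same
# fields could be collected without a browser, assuming the static HTML served to a plain
# HTTP client still exposes the 'news_area' / 'news_tit' class names (Naver does not
# guarantee this, so treat it only as a rough alternative).
#resp = requests.get(naver_url, headers={'User-Agent': 'Mozilla/5.0'})
#soup = BeautifulSoup(resp.text, 'html.parser')
#for area in soup.select('div.news_area'):
#    title_tag = area.select_one('a.news_tit')
#    if title_tag is not None:
#        print(title_tag.get_text(), title_tag.get('href'))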

df_news = pd.DataFrame.from_records(tbl_list, columns=['뉴스제목','매체','등록일','원문주소'])

# Exclude similar (near-duplicate) news, kept commented out
#from fuzzywuzzy import fuzz, process

#df_news['Similarity'] = None

#for i in range(len(df_news)) :
#    if i < len(df_news) - 1:
#        df_news.loc[i, 'Similarity'] = fuzz.ratio(df_news.iloc[i]['뉴스제목'], df_news.iloc[i+1]['뉴스제목'])
#    else :
#        df_news.loc[i, 'Similarity'] = 0
        
#df_news['Similarity_Shift'] = df_news['Similarity'].shift(1)
#df_news['Similarity_Shift'] = df_news['Similarity_Shift'].fillna(0)
#df_news['Similarity_Min'] = df_news.apply(lambda x : min(x['Similarity'], x['Similarity_Shift']), axis = 1)
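# (Sketch) A simpler alternative to the fuzzy matching above: drop rows that share the exact
# same article URL with pandas' drop_duplicates. It will not catch near-duplicate headlines
# the way fuzz.ratio does, so it is only a rough substitute.
#df_news = df_news.drop_duplicates(subset=['원문주소']).reset_index(drop=True)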
df_news_final = df_news.copy() # work on a copy to avoid SettingWithCopyWarning

# fill empty dates with the date of the preceding row (i > 0 guards the first row, which has none)
for i in range(len(df_news_final)) :
    if i > 0 and df_news_final.loc[i, '등록일'] == "" :
        df_news_final.loc[i, '등록일'] = df_news_final.loc[i-1, '등록일']
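# (Sketch) The loop above can also be expressed with pandas' own fill logic: turn empty
# strings into NA and forward-fill from the previous row. Shown commented out so the
# explicit loop above stays the version that runs.
#df_news_final['등록일'] = df_news_final['등록일'].replace('', pd.NA).ffill()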
        
#for i in range(len(df_news_final))[::-1] :
#    if df_news_final.loc[i, 'Similarity_Min'] >= 40 :
#        df_news_final = df_news_final.drop(i)
        
#df_news_final = df_news_final.reset_index()

news_html = ""
for i in range(len(df_news_final)) :
    news_html = news_html + "<div><p><a href ='\
    " + str(df_news_final.loc[i, '원문주소']) + "'>\
    " + str(df_news_final.loc[i, '뉴스제목']) + "</a>\
    " + str(df_news_final.loc[i, '매체']) + " \
    " + str(df_news_final.loc[i, '등록일']) + "</p></div>"
    
print(news_html)
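As a rough sketch (the file names below are arbitrary examples, not part of the original script), the generated HTML and the DataFrame could also be written to disk instead of only printed:

# Sketch: persist the results; file names are placeholder examples.
with open('news_list.html', 'w', encoding='utf-8') as f:
    f.write(news_html)
df_news_final.to_csv('news_list.csv', index=False, encoding='utf-8-sig') # utf-8-sig keeps Korean readable in Excel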