코딩/기타

네이버 쇼핑 웹 크롤링

자본왕김민춘 2023. 3. 22. 21:10
반응형

출처 : 패스트캠퍼스 김용담 강사

import time
import warnings
from urllib.parse import quote

import bs4
import pandas as pd
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm  # progress bar

warnings.simplefilter('ignore')

# Crawl Naver Shopping search results for `query` and export them to Excel.
query = "삼성"

# Parallel result lists: index i of every list describes the same product.
titles = []
prices = []
review_counts = []
buy_counts = []
published_dates = []
favorites = []


def _to_int(text):
    """Convert a count such as "1,234" to an int by removing the commas."""
    return int(text.replace(',', ''))


def _parse_footer(footer):
    """Return (review_count, buy_count, favorite_count, date) for one item footer.

    Every value is computed before anything is returned, so a footer that is
    missing some fields can never leave the result lists partially updated.
    """
    nums = footer.find_elements(By.CLASS_NAME, 'basicList_num__sfz3h')
    spans = footer.find_elements(By.TAG_NAME, 'span')
    try:
        if "구매건수" in footer.text:
            # Footer shows a purchase count: numbers are read in the order
            # reviews, purchases, favorites.
            review_count = _to_int(nums[0].text)
            buy_count = _to_int(nums[1].text)
            favorite = _to_int(nums[2].text)
            # [4:] drops the first four characters
            # (presumably a "등록일 " label -- confirm against the page).
            date = spans[0].text[4:]
        else:
            # No purchase count: numbers are read as reviews, favorites.
            favorite = _to_int(nums[1].text)
            review_count = _to_int(nums[0].text)
            buy_count = 0
            etc = footer.find_elements(By.CLASS_NAME, 'basicList_etc__LSkN_')
            date = etc[1].text[4:]
    except IndexError:
        # Not enough reviews yet, so some of the info is not displayed.
        # Fall back to "no reviews"; guard the lookups so an empty footer
        # does not raise a second IndexError from inside the handler.
        review_count = 0
        buy_count = 0
        favorite = _to_int(nums[0].text) if nums else 0
        date = spans[0].text[4:] if spans else ""
    return review_count, buy_count, favorite, date


# NOTE(review): passing the driver path positionally is only accepted by older
# Selenium releases -- confirm against the installed version.
driver = Chrome('./chromedriver')

try:
    # Percent-encode the query so spaces, '&', etc. cannot corrupt the URL.
    encoded_query = quote(query, safe='')

    for page_no in tqdm(range(1, 3)):  # result pages 1 through 2

        page_url = f"https://search.shopping.naver.com/search/all?frm=NVSHATC&origQuery={encoded_query}&pagingIndex={page_no}&pagingSize=40&productSet=total&query={encoded_query}&sort=rel&timestamp=&viewType=list"
        driver.get(page_url)
        time.sleep(1)

        # Scroll to the bottom repeatedly; about 7 repetitions are needed
        # before the page is scrolled all the way down.
        for _ in range(7):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(0.5)

        list_basis = driver.find_element(By.CLASS_NAME, "list_basis")
        item_list = list_basis.find_elements(By.CLASS_NAME, 'basicList_inner__xCM3J')

        for item in tqdm(item_list):
            # Gather every field first, then append once, so that a failure
            # on one field cannot leave the lists with different lengths.
            title = item.find_element(By.CLASS_NAME, 'basicList_title__VfX3c').text

            # [:-1] drops the trailing character
            # (presumably the currency suffix -- confirm against the page).
            price = item.find_element(By.CLASS_NAME, "price_num__S2p_v").text[:-1].replace(',', '')

            footer = item.find_element(By.CLASS_NAME, 'basicList_etc_box__5lkgg')
            review_count, buy_count, favorite, date = _parse_footer(footer)

            titles.append(title)
            prices.append(price)
            review_counts.append(review_count)
            buy_counts.append(buy_count)
            favorites.append(favorite)
            published_dates.append(date)
finally:
    # Always close the browser, even if the crawl fails part-way through.
    driver.quit()

print(len(titles), len(prices), len(review_counts), len(published_dates), len(favorites))

result = pd.DataFrame({"제품명" : titles,
                       "가격" : prices,
                       "리뷰수" : review_counts,
                       "등록일" : published_dates,
                       "찜하기" : favorites})
result

result.to_excel(f"naver_shopping({query}).xlsx", index=False)
반응형