코딩/기타
네이버 쇼핑 웹 크롤링
자본왕김민춘
2023. 3. 22. 21:10
반응형
출처 : 패스트캠퍼스 김용담 강사
import time
import warnings

import bs4
import pandas as pd
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm  # progress bar
warnings.simplefilter('ignore')
# Scrape Naver Shopping search results for `query` (list view, 40 items per
# page) with Selenium and export title, price, review count, registration date
# and favorite count to an Excel file.
query = "삼성"

# One entry per scraped product. The lists are kept index-aligned (exactly one
# append per product each) so they can be combined into a DataFrame below.
titles = []
prices = []
review_counts = []
buy_counts = []
published_dates = []
favorites = []


def _to_int(text):
    """Convert a count string with thousands separators (e.g. "1,234") to int."""
    return int(text.replace(',', ''))


# NOTE(review): passing the chromedriver path positionally depends on the
# installed Selenium version accepting it - confirm against your environment.
driver = Chrome('./chromedriver')
try:
    for page_no in tqdm(range(1, 3)):  # pages 1 and 2
        page_url = (
            "https://search.shopping.naver.com/search/all"
            f"?frm=NVSHATC&origQuery={query}&pagingIndex={page_no}"
            f"&pagingSize=40&productSet=total&query={query}"
            "&sort=rel&timestamp=&viewType=list"
        )
        driver.get(page_url)
        time.sleep(1)

        # Scroll to the bottom repeatedly; per the original author's note it
        # takes about seven passes to actually reach the end of the page.
        for _ in range(7):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(0.5)

        list_basis = driver.find_element(By.CLASS_NAME, "list_basis")
        item_list = list_basis.find_elements(By.CLASS_NAME, 'basicList_inner__xCM3J')

        for item in tqdm(item_list):
            title = item.find_element(By.CLASS_NAME, 'basicList_title__VfX3c')
            titles.append(title.text)

            # Drop the last character (presumably the currency suffix) and the
            # thousands separators; the price is kept as a string.
            price = item.find_element(By.CLASS_NAME, "price_num__S2p_v").text[:-1].replace(',', '')
            prices.append(price)

            footer = item.find_element(By.CLASS_NAME, 'basicList_etc_box__5lkgg')
            reviews = footer.find_elements(By.CLASS_NAME, 'basicList_num__sfz3h')
            footer_text = footer.text

            # Work out every value first and append afterwards, so that an
            # IndexError half-way through can never leave the result lists
            # with different lengths.
            try:
                if "구매건수" in footer_text:
                    # Footer shows: review count, purchase count, favorites.
                    review_count = _to_int(reviews[0].text)
                    buy_count = _to_int(reviews[1].text)
                    favorite = _to_int(reviews[2].text)
                    # [4:] skips the first four characters of the span text
                    # (presumably the date label - verify against the page).
                    date = footer.find_elements(By.TAG_NAME, 'span')[0].text[4:]
                else:
                    # Footer shows: review count, favorites (no purchase count).
                    favorite = _to_int(reviews[1].text)
                    review_count = _to_int(reviews[0].text)
                    buy_count = 0
                    date = footer.find_elements(By.CLASS_NAME, 'basicList_etc__LSkN_')[1].text[4:]
            except IndexError:
                # Not enough reviews yet, so some of the numbers are missing
                # from the footer: fall back to defaults instead of crashing.
                review_count = 0
                buy_count = 0
                favorite = _to_int(reviews[0].text) if reviews else 0
                spans = footer.find_elements(By.TAG_NAME, 'span')
                date = spans[0].text[4:] if spans else ''

            review_counts.append(review_count)
            buy_counts.append(buy_count)
            favorites.append(favorite)
            published_dates.append(date)
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

# Sanity check: all of these lengths must match for the DataFrame below.
print(len(titles), len(prices), len(review_counts), len(published_dates), len(favorites))

result = pd.DataFrame({"제품명" : titles,
                       "가격" : prices,
                       "리뷰수" : review_counts,
                       "등록일" : published_dates,
                       "찜하기" : favorites})
result  # displays the table when run as a notebook cell
result.to_excel(f"naver_shopping({query}).xlsx", index=False)
반응형