본문 바로가기

코딩/기타

(9)

네이버 쇼핑 웹 크롤링 출처 : 패스트캠퍼스 김용담 강사 import requests import bs4 import pandas as pd from selenium.webdriver.common.by import By import time from tqdm.notebook import tqdm #프로그레스바 import warnings warnings.simplefilter('ignore') query = "삼성" titles = [] prices = [] review_counts = [] buy_counts = [] published_dates = [] favorites = [] driver = Chrome('./chromedriver') for page_no in tqdm(range(1, 3)): #페이지 수 1~2페이지 ..

plotly 사용 예시(그래프) histogram import plotly.express as px df = px.data.tips() fig = px.histogram(data_frame=df, x="total_bill", nbins=30, #가로 범위 color = "sex") #seaborn에서 hue와 같다. 성별 2종류로 구분하여 표현 fig.show() #show를 해야 표현 가능 import plotly.express as px df = px.data.tips() fig = px.box(df, x="day", y="total_bill", color="smoker", notched=True) fig.show() ganttchart(프로젝트 일정관리) # gantt chart import plotly.express as px ..

seaborn(lineplot, pointplot, barplot, heatmap, pairplot 등) 펭귄 데이터 불러오기 import seaborn as sns sns.set_theme(style='whitegrid') penguins = sns.load_dataset("penguins").dropna() #NaN(비어있는 데이터) 제거 lineplot (1) sns.lineplot(data = penguins, x="body_mass_g", y = "flipper_length_mm", ci = None) #ci는 오차범위 그래프, None은 설정안함 lineplot(2), species별로 그래프 색깔을 다르게 sns.lineplot(data = penguins, x="body_mass_g", y = "bill_length_mm", ci = None, hue = "species") #species별로..

엑셀 데이터 합치기 from glob import glob from tqdm.notebook import tqdm #프로그레스bar import os # 엑셀 합치기 stations_files = glob('./data/opinet/*.xls') total = pd.DataFrame() #temp를 누적하여 저장할 total을 만드는데 데이터프레임 형태로 만듦 for file_name in tqdm(stations_files): #프로그레스bar 보이기 temp = pd.read_excel(file_name, header = 2) total = pd.concat([total, temp]) #concat 데이터 아래 또 다른 데이터 붙이기 total = total.sort_values(by="경유") #내림차순 정렬은 (by="경..

데이터분석 입문(타이타닉) import pandas as pd # 타이타닉 데이터 불러오기 titanic = pd.read_csv("./data/titanic.csv") #나이가 30세 이상인 사람의 이름 보기 titanic.loc[30

판다스(pandas) 기본 mask, loc, iloc # pandas 라이브러리를 불러옵니다. pd를 약칭으로 사용합니다. import numpy as np #수치연산 import pandas as pd import matplotlib.pyplot as plt #그래프 그려줌 import seaborn as sns #그래프 그려줌 # s는 1, 3, 5, 6, 8을 원소로 가지는 pandas.Series pd.Series([1,3,5,6,8]) # 12x4 행렬에 1부터 48까지의 숫자를 원소로 가지고, index는 0부터 시작하고, columns는 순서대로 X1, X2, X3, X4로 하는 DataFrame 생성 df = pd.DataFrame(data=np.arange(1,49).reshape(12,4),columns=["X..

데이터 크롤링(네이버 코스피 가져오기) import bs4 import requests price_list = [] date_list = [] #제일 끝자리 696페이지에 해당하는 자료를 뽑기 위해서는 href를 뽑아와야 하는데 그게 너무 많음. #그래서 그 위에 해당하는 td class = "pgRR"에 해당하는 정보를 뽑아옴 last_url = source.find_all('td',class_='pgRR')[0].find_all("a")[0]["href"] #td class안에 href가 하나 더 있기 때문에 find_all을 두번 사용함 last_page = int(last_url.split('&page=')[-1]) for page_no in range(1, last_page+1): page_url = f'https://finance...

이전 1 2 다음

티스토리툴바