'pandas' 데이터 프레임

import pandas as pd

NumPy를 기반으로(NumPy 배열 위에) 구현된 데이터 분석 라이브러리

*내부적으로 NumPy 배열을 사용하므로 NumPy의 연산·함수 대부분을 그대로 적용할 수 있음

외부 데이터를 간편하게 읽고 쓰기

DataFrame 데이터 보관 / 정리 / 시각적 분석

*DataFrame: 표형식의 데이터 구조 (데이터 특징을 나타내는 Column과 각 객체를 나타내는 Row로 구성)

컬럼의 주요 자료형(dtype) — int64: 정수 / float64: 실수(부동소수점) / object: 텍스트(문자열) 또는 혼합 자료형 / bool: 참, 거짓 / datetime64: 날짜, 시간 / category: 카테고리

Pandas 기본 사용법

import numpy as np #numpy 추가
import pandas as pd #pandas 추가

# Four interchangeable ways to prepare the same two-row table.

# 1) A plain list of rows.
lists = [['illua', 25], ['blog', 30]]

# 2) A Python dictionary mapping column name -> column values.
#    The variable is deliberately NOT called `dict`: that would shadow the
#    builtin and break any later `dict(...)` call in the same session.
ids = ['illua', 'blog']
ages = [25, 30]
data_dict = {'id': ids, 'age': ages}

# 3) A NumPy array of rows. Because the rows mix str and int, NumPy stores
#    every element (including the ages) as a string.
my_data_array = np.array(lists)

# 4) A list of pandas Series, one Series per row.
list_series = [pd.Series(['illua', 25]), pd.Series(['blog', 30])]

# Build the DataFrame from the row list, naming the columns and the row labels.
my_dataframe = pd.DataFrame(lists, columns=['id', 'age'], index=['a', 'b'])
my_dataframe  # display the table (notebook cell output)

type(my_dataframe)   # check the object type (a pandas DataFrame)
my_dataframe.dtypes  # dtype of each column; a column has exactly one dtype

DataFrame(2차원) 정보 확인 기본

import pandas as pd

# Read the CSV file. index_col selects the column that supplies the row labels,
# given either as a column position (0 ... n-1) or as a column name.
info_dataframe=pd.read_csv('info.csv', index_col=0) 
info_dataframe           # display the whole (2-D) table
info_dataframe.head(2)   # first 2 rows
info_dataframe.tail(2)   # last 2 rows
info_dataframe.shape     # (number of rows, number of columns)
info_dataframe.columns   # column labels
info_dataframe.index     # row labels
info_dataframe.dtypes    # dtype of each column
info_dataframe.info()    # prints a summary: non-null count and dtype per column
info_dataframe.describe()# summary statistics (count, mean, std, min, quartiles, max) of numeric columns

# Sort by the given column in descending order (ascending=False).
# inplace=True reorders info_dataframe itself instead of returning a sorted copy.
info_dataframe.sort_values(by='정렬할 컬럼명', ascending=False, inplace=True)

DataFrame(2차원) 정보 확인 심화

import pandas as pd

info_dataframe = pd.read_csv('info.csv', index_col=0)

# --- Label-based selection: .loc[row_selector, column_selector] ---
info_dataframe.loc['행 명', ['컬럼 명1', '컬럼 명2']]  # one row, two named columns
info_dataframe.loc['행 명', :]                        # one whole row (all columns), form 1
info_dataframe.loc['행 명']                           # one whole row (all columns), form 2

# Column range for all rows. Label slices include BOTH end points.
info_dataframe.loc[:, '컬럼':'컬럼n']

# A lone slice inside .loc is applied to ROWS, not columns, so it cannot be
# used as a shorthand for the column slice above.
info_dataframe.loc['행 명1':'행 명n']

# Boolean mask: keeps the rows whose entry is True. The list must contain
# exactly one entry per row; a shorter list is NOT padded with False and
# pandas raises an error (IndexError in current versions).
info_dataframe.loc[[True, True, False]]

# Element-wise comparison -> boolean Series that is True where the value is
# strictly greater than 5 (5 itself is excluded). It can be used as a row
# mask; combine several conditions with & / |, each wrapped in parentheses.
info_dataframe['컬럼 명'] > 5

# Every row whose value in that column is strictly greater than 5.
info_dataframe.loc[info_dataframe['컬럼 명'] > 5]

# --- Position-based selection: .iloc uses zero-based integer positions ---
# Rows at positions 1 and 2 (the 2nd and 3rd rows), columns at positions
# 3 and 4 (the 4th and 5th columns).
info_dataframe.iloc[[1, 2], [3, 4]]

DataFrame(2차원) 추가/제거

import pandas as pd

info_dataframe = pd.read_csv('info.csv', index_col=0)

# Overwrite the row with this label (or append it if the label is new).
# The list must supply exactly one value per column.
info_dataframe.loc['행명'] = ['컬럼1 값', '컬럼2 값', '컬럼3 값']

# Set every value of one column to 'Y'. A column is addressed with [] (or
# .loc[:, name]); .loc[name] on its own addresses a ROW and would create or
# overwrite a row with that label instead.
info_dataframe['컬럼 명'] = 'Y'

# Assigning to a column label that does not exist yet adds a new column.
# (To add a new row, assign through .loc['new row label'] as shown above.)
info_dataframe['없는 컬럼 또는 행 명'] = '값'

# Remove the row with the given label. axis='index' drops rows,
# axis='columns' drops columns. With inplace=False the original DataFrame is
# left untouched and the result is returned, so it is bound to a name here.
dropped_dataframe = info_dataframe.drop('행 명', axis='index', inplace=False)

# Rename columns: columns={'current name': 'new name'}.
info_dataframe.rename(columns={'position': 'Position'}, inplace=True)

# Give the index a name; pick one that does not clash with a column name.
info_dataframe.index.name = '행 명'

# Use an existing column as the new index. The previous index is discarded,
# so copy it into a column first (or use reset_index) if it is still needed.
info_dataframe.set_index('컬럼 명', inplace=True)

# Copy the current index values into a regular data column.
info_dataframe['새로운 컬럼 명'] = info_dataframe.index

*Series(1차원) 관련 참고 사항

import pandas as pd

# Load the data set; selecting a single column below yields a 1-D Series.
dataframe=pd.read_csv('big_data.csv')

dataframe['컬럼명'].unique()              # array of the distinct values in the column (use .nunique() for how many there are)
dataframe['컬럼명'].value_counts()        # number of occurrences of each distinct value
dataframe['컬럼명'].value_counts().shape  # one-element tuple: (number of distinct non-missing values,)
dataframe['컬럼명'].describe()            # summary statistics of the column

저작자표시 비영리 동일조건 (새창열림)

'학습 log (이론) > python' 카테고리의 다른 글

'통계 분석' (0)	2020.03.04
'Seaborn' (0)	2020.03.04
'시각화' (0)	2020.03.03
'numpy' (0)	2020.03.01
'Anaconda3, Jupyter' 사용툴 (0)	2020.03.01

'pandas' 데이터 프레임

'학습 log (이론) > python' 카테고리의 다른 글

관련글

티스토리툴바