Python 기초 공부

Programming/Python

Python 기초 공부 - 6 (Pandas)

Joon09 2021. 3. 8. 19:42

python : 문자열 처리
- 검색, 분리(split), 추출, 대체, 결합, 공백처리
- 문자열의 기본자료구조는 배열 (1차원 배열)

정규표현식 (regular expression) : re => 모든 언어에서 똑같은 방식으로 처리
- 패턴으로 처리

smiles = "C(=N)(N)N.C(=0)(0)0" # a str is indexed like a 1-D array of characters
print(smiles[0]) # first character
print(smiles[1]) # second character
print(smiles[-1]) # a negative index counts from the end: last character
print(smiles[1:5]) # slice of indices 1..4 (the end index is exclusive)
print(smiles[10:-4]) # from index 10 up to, but not including, the 4th character from the end

C
(
0
(=N)
C(=0)

# Counting and locating substrings
s = "That that is is that that is"
print(s.count('t')) # case-sensitive: the capital 'T' of "That" is not counted
s = s.lower()
print(s.count("that")) # after lower-casing, every "that" is counted
s.find("that") # index of the first occurrence of the substring (-1 when absent)
s.find("is")
s.find(" ")

7
4
4

# ASCII code 97 = 'a' (code 95 is the underscore '_')
print('C:\\nowhere')
print(r'C:\\nowhere') # r'' is a raw string: backslashes are kept literally (handy for regex patterns); Python 3 str is Unicode by default
print(u'Hello, world!') # u'' Unicode-literal prefix from Python 2.7, still accepted by Python 3

C:\nowhere
C:\\nowhere
Hello, world!

# pandas 도 문자열 함수 지원 => 후처리가 편리
import pandas as pd
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

# 정규표현식
# []: 선택, + : 여러개
monte.str.extract('([A-Za-z]+)', expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

# ^ : 처음부터, [^] : 부정, $ : 끝
monte.str.findall(r'^[^AEIOU].*[^aeiou]$') # 처음이 AEIOU로 시작하지 않고, 끝이 aeiou가 아닌 것
# 자세한 내용은 아래 링크 참조
# http://pythonstudy.xyz/python/article/401-%EC%A0%95%EA%B7%9C-%ED%91%9C%ED%98%84%EC%8B%9D-Regex

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

# Find phone numbers in free text with a regular expression
import re
text = "문의사항이 있으면 032-232-3245으로 연락주시기 바랍니다.or 010-456-4658"
# \d    : exactly one digit
# {n}   : repeat the preceding item n times
# (...) : capture group
# Long-hand equivalent of the pattern below: r'\d\d\d-\d\d\d-\d\d\d\d'
regex = re.compile(r'(\d{3})-(\d{3}-\d{4})')
# search() returns only the FIRST match, or None when nothing matches
matchobj = regex.search(text)
phonenumber = matchobj.group() if matchobj else None
print(phonenumber)
# The text contains more than one number; finditer() visits every match.
all_numbers = [m.group() for m in regex.finditer(text)]

032-232-3245

# boolean 형태로 검색해서 찾기
import numpy as np
s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s4.str.contains('A', na=False)

0     True
1    False
2    False
3     True
4    False
5    False
6     True
7    False
8    False
dtype: bool

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

import re

# Sample e-mail strings keyed by person: 'Rob' has no domain part and 'Wes' is missing (NaN)
data = Series(
    {
        'Dave': 'iadslba@naver.com',
        'Steve': 'steve@gmail.com',
        'Rob': 'rob',
        'Wes': np.nan,
    }
)
print(data)

Dave iadslba@naver.com

Steve steve@gmail.com

Rob rob

Wes NaN

dtype: object

print(data.isnull())
print("네이버", data.str.contains('naver'))

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool
네이버 Dave      True
Steve    False
Rob      False
Wes        NaN
dtype: object

# The r prefix marks a raw string literal, so backslashes reach the regex engine untouched.
# In a regex '.' matches any single character; write \. for a literal dot.
# str.match gives True/False per element (missing values stay NaN).
# The top-level domain is [a-z]{2,}: with a single [a-z], findall would stop
# after one letter and return e.g. 'iadslba@naver.c'.
pattern = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
matches = data.str.match(pattern, flags=re.IGNORECASE) # case-insensitive

print("matches 결과 :", matches)
matches = data.str.findall(pattern, flags=re.IGNORECASE)
print("findall 결과 :", matches)

matches 결과 : Dave      True
Steve     True
Rob      False
Wes        NaN
dtype: object
findall 결과 : Dave     [iadslba@naver.c]
Steve      [steve@gmail.c]
Rob                     []
Wes                    NaN
dtype: object

# one-hot-encoding
s = pd.Series(['a','a|b', np.nan, 'a|c'])
print(s)
# 행은 관측 숫자, 열은 변수
s.str.get_dummies(sep='|')

0      a
1    a|b
2    NaN
3    a|c
dtype: object
a b c
0 1 0 0
1 1 1 0
2 0 0 0
3 1 0 1

# 함수를 매개변수로 전달할 때는 함수 실행이 아니고 함수 위치를 전달하는 것
df = pd.DataFrame(['한글', '미국', '일본?'], columns=['text'])
# 파생변수
df['text_length'] = df['text'].map(len) # 시리즈에 함수 적용
print(df)

text  text_length
0   한글            2
1   미국            2
2  일본?            3

data = {'name':['하늘이','찬호박','우리야','함께가','하성공'],
       'age':[40,50,30,20,70],
       'preScore':[14,28,39,25,32],
       'postScore':[20, 90, 55, 65, 79]}
df = pd.DataFrame(data, columns = ['name','age','preScore','postScore'])
df

# 4줄짜리 코드
print(df['age'].count())
print(df['preScore'].mean())
print(df['preScore'].std())
print(df['preScore'].cumsum())


# --- 한줄로 해결
print()
print("데이터 설명")
print(df['preScore'].describe())
print("데이터 끝")

5
27.6
9.235799911215056
0     14
1     42
2     81
3    106
4    138
Name: preScore, dtype: int64

데이터 설명
count     5.0000
mean     27.6000
std       9.2358
min      14.0000
25%      25.0000
50%      28.0000
75%      32.0000
max      39.0000
Name: preScore, dtype: float64
데이터 끝

print(df['preScore'].var())
print(df['preScore'].std())
print(df['preScore'].skew()) # skewness: 0 for a symmetric distribution
print(df['preScore'].kurt()) # pandas returns excess (Fisher) kurtosis: 0 for a normal distribution, not 3

# Skewness measures asymmetry (positive: long right tail, mass concentrated on the left);
# kurtosis measures how peaked / heavy-tailed the distribution is (positive: sharper peak than normal).
# When both are close to 0, assuming a normal distribution is reasonable.
# https://m.blog.naver.com/PostView.nhn?blogId=moses3650&logNo=220880815585&proxyReferer=https%3A%2F%2Fwww.google.com%2F

85.30000000000001
9.235799911215056
-0.5110345040062979
0.8509652849263816

import pandas as pd
import numpy as np
df = pd.DataFrame({'two' : pd.Series(np.random.randn(3),   index=['c', 'b', 'a']),
                   'one' : pd.Series(np.random.randn(4),   index=['d', 'b', 'c', 'a']),
                   'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

df.loc[:'b',:'one']

row = df.iloc[1] # 한 행 데이터
print(row)

two     -0.625601
one     -0.454957
three   -1.473806
Name: b, dtype: float64

column = df['two'] # 한 열 데이터
print(column)

a   -0.946361
b   -0.549165
c    0.009467
d         NaN
Name: two, dtype: float64

판다스는 열중심 - 열끼리의 상관계수

print(df.corr()) # correlation matrix
# Row and column labels are the column (variable) names
# - : negative correlation
# + : positive correlation
# A correlation matrix is square and symmetric => eigen-decomposition gives eigenvalues (3 here) and eigenvectors (3x3) whose axes are mutually orthogonal
# This is where principal component analysis comes from: the axis with the largest eigenvalue is the main principal component; dropping the small ones is a form of variable selection (keep roughly 85% and discard the rest)

            two       one    three
two    1.000000  0.687497 -1.00000
one    0.687497  1.000000 -0.55333
three -1.000000 -0.553330  1.00000
            two       one     three
two    0.230574  0.103110 -0.331978
one    0.103110  0.072693 -0.134150
three -0.331978 -0.134150  0.539151

# Correlation matrix
# Used to inspect the relationships among many variables,
# to check the linear relation between dependent and independent variables in regression,
# or to detect multicollinearity among the independent variables.
# https://rfriend.tistory.com/tag/%EC%83%81%EA%B4%80%EA%B3%84%EC%88%98%20%ED%96%89%EB%A0%AC
# Visualised with a scatter-plot matrix or a correlation matrix plot

import pandas as pd
lst = [[1,2,3,4,5,6,7], [10,15,20,25,50,55,60],[0,0,0,0,0,0,0],[-1,-20,-30,-45,-50,-55,-70]]
df = pd.DataFrame(lst).T # transpose so that each inner list becomes a column
corr = df.corr(method='pearson')
print(corr) # column 2 is constant (all zeros), so its correlations come out as NaN

          0         1   2         3
0  1.000000  0.966282 NaN -0.983120
1  0.966282  1.000000 NaN -0.917002
2       NaN       NaN NaN       NaN
3 -0.983120 -0.917002 NaN  1.000000

# 공분산과 상관계수
# 두 개 이상의 서로 연관성을 갖는 자료 값의 집합들이나 혹은 확률 변수들의 관계를 나타내는 값
# heatmap과 같은 서술 통계 방법으로 묘사하거나 결합 확률 분포를 사용하여 정의

import seaborn as sns
import pandas as pd
import scipy as sp
import matplotlib as mpl
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
sns.set()
sns.set_color_codes()

seaborn = 시각화 라이브러리

X = 10*np.random.randn(1000, 6)
X

array([[ 14.04407801,   7.1103409 ,  -3.86595689, -13.00789291,
        -11.11094085, -12.64901497],
       [ 10.51946269,  -8.41763573,  -5.28104629, -15.12084608,
          0.74894373,   5.62686195],
       [ -6.04795292,  16.0864203 ,   4.05851898,  -2.78020106,
         20.63188984,   4.91760841],
       ...,
       [-16.72089346,  13.8303662 ,  15.92333104,  19.47624157,
          5.07710757,  -9.00793982],
       [  4.14580223,  -1.27679306,  -0.4218097 ,   3.48147417,
          8.9224342 , -10.5354439 ],
       [ -5.50526103,  12.94422897,  17.01463157,  23.3407906 ,
          6.75389859, -22.9241085 ]])

# Two ways to compute the covariance matrix (columns are variables, rows are observations)
C = np.cov(X, rowvar=False)
# Manual version: every COLUMN must be centred on its own mean (axis=0);
# subtracting the single global X.mean() does not yield the covariance matrix.
X_centered = X - X.mean(axis=0)
X_centered.T.dot(X_centered) / (len(X) - 1)

array([[105.49965872,  -0.83937216,   0.11596065,  -0.7921198 ,
         -1.87393213,   2.21192576],
       [ -0.83937216, 104.94451983,   2.50883727,  -0.75557596,
          0.13060117,  -2.48189517],
       [  0.11596065,   2.50883727,  99.67975659,  -0.1115056 ,
         -1.11046052,   2.67437667],
       [ -0.7921198 ,  -0.75557596,  -0.1115056 ,  99.26666756,
         -3.78445444,   1.6691171 ],
       [ -1.87393213,   0.13060117,  -1.11046052,  -3.78445444,
         98.98425047,   0.75223531],
       [  2.21192576,  -2.48189517,   2.67437667,   1.6691171 ,
          0.75223531,  97.30716689]])

plt.figure(figsize=(12,6))
plt.imshow(C, interpolation="none")
plt.colorbar()
plt.grid(False)

# Correlation
# A measure of the linear relationship between two random variables (Pearson correlation)
# correlation matrix
# rowvar=0 => each column is a variable and each row is an observation
R = np.corrcoef(X, rowvar=0)
R

array([[ 1.        , -0.00797718,  0.00113079, -0.0077404 , -0.01833772,
         0.02183095],
       [-0.00797718,  1.        ,  0.02452952, -0.00740281,  0.0012814 ,
        -0.02456016],
       [ 0.00113079,  0.02452952,  1.        , -0.00112096, -0.01117935,
         0.0271548 ],
       [-0.0077404 , -0.00740281, -0.00112096,  1.        , -0.03817847,
         0.01698293],
       [-0.01833772,  0.0012814 , -0.01117935, -0.03817847,  1.        ,
         0.00766475],
       [ 0.02183095, -0.02456016,  0.0271548 ,  0.01698293,  0.00766475,
         1.        ]])

plt.figure(figsize=(12,6))
plt.imshow(R, interpolation='none')
plt.colorbar()
plt.grid(False)

import statsmodels.api as sm
data = sm.datasets.get_rdataset('anscombe')
df = data.data
df

# Draw the four x/y pairs of the data set, each with a fitted regression line,
# one pair per cell of a 2x2 subplot grid.
plt.figure(figsize=(20,6))
panels = [
    (221, 'x1', 'y1'),
    (222, 'x2', 'y2'),
    (223, 'x3', 'y3'),
    (224, 'x4', 'y4'),
]
for cell, x_col, y_col in panels:
    plt.subplot(cell)
    sns.regplot(x=x_col, y=y_col, data=df)
plt.show()

# y is fully determined by x, but not linearly: the Pearson coefficient printed below is ~0
x = np.linspace(-0.5, .5, 100)
y = x**2
print(np.corrcoef(x,y))
plt.scatter(x,y)

[[1.00000000e+00 2.52533867e-16]
[2.52533867e-16 1.00000000e+00]]
<matplotlib.collections.PathCollection at 0x26aab6586a0>

# rank-based correlation
# Import the submodule explicitly: on older SciPy releases a bare
# `import scipy as sp` does not make `sp.stats` available.
from scipy import stats

x = "TGGAGGCAATGGCGGCCAGCA"
y = "TGAGGGCCGGCGAGAATGGCA"
# Encode each nucleotide letter as an integer code (A=0, T=1, G=2, C=3)
mapping = {base: code for code, base in enumerate('ATGC')}
x = [mapping[base] for base in x]
y = [mapping[base] for base in y]

stats.spearmanr(x, y) # Spearman rank correlation (convert both variables to ranks, then correlate the ranks)

SpearmanrResult(correlation=-0.07786173874795632, pvalue=0.7372715629763089)

sp.stats.kendalltau(x,y) # 켄달 토우의 순위 상관계수 ( 순위가 같은 짝 concordant의 수를 이용하여 계산 )

KendalltauResult(correlation=-0.06645435190240834, pvalue=0.7305031620485603)

print(df.cov()) # 공분산 행렬  (x-xbar)*(y-ybar)/ (n-1) *자유도에서 1을 빼주는 건 자기 자신은 선택이 아니기 때문

df1 = pd.DataFrame({'col':['foo', 0, np.nan]})
df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
df3 = pd.DataFrame({'col':[1, 2, 3]}, index=[2,1,0]) 
df2

print(df3.sort_values(by=['col']))

   col
2    1
1    2
0    3

print(df2.sort_index())

   col
0  foo
1    0
2  NaN

# 인덱스
# index 행
# columns 열

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
iris = sns.load_dataset('iris')
iris.sepal_length[:20].plot(kind='bar', rot=0) # rot = 글씨 rotate
plt.show()

iris.head()

# Search "IO tools" in the pandas documentation for the full list of readers/writers
names = ['한국성','공하자','희망이','꿈꾼다','아리랑']
births = [25, 30, 38, 28, 31]
BabyDataSet = list(zip(names, births))
print(BabyDataSet)
df = pd.DataFrame(data = BabyDataSet, columns=['Names','Births'])
print(df)
# index=False : do not write the index (a saved index comes back as an extra column;
#               alternatively an index column can be designated when loading)
# header=True : write the column names as the first line
df.to_csv('births2020.csv', index=False, header=True, encoding = "UTF-8") # Windows and Linux file formats differ
Location = './births2020.csv'
df = pd.read_csv(Location)
print(df)
# The file already has a header line. When supplying our own `names`, pass
# header=0 so that line is replaced instead of being loaded as a data row.
df = pd.read_csv(Location, names=['Names', 'Births'], header=0, encoding="UTF-8")

[('한국성', 25), ('공하자', 30), ('희망이', 38), ('꿈꾼다', 28), ('아리랑', 31)]
  Names  Births
0   한국성      25
1   공하자      30
2   희망이      38
3   꿈꾼다      28
4   아리랑      31
  Names  Births
0   한국성      25
1   공하자      30
2   희망이      38
3   꿈꾼다      28
4   아리랑      31

json : 키-값 쌍으로 이루어진 텍스트 기반 데이터 형식 (문서형 NoSQL DB에서 주로 쓰는 형태)

pickle 메모리에 저장된 형태 그대로 올리고 받고

기본적인 csv 불러와서 기초 분석 진행

# 행이름
pim = pd.read_csv("diab.csv", index_col=0) # unnamed 없애주는 index_col=0
pim

diab.csv

0.01MB

pim.describe()

print(pim.apply(type))

pim.applymap(type).head(1)

pim.dtypes

npreg      int64
glu        int64
bp         int64
skin       int64
bmi      float64
ped      float64
age        int64
type      object
dtype: object

print("데이터갯수", pim.count())

데이터갯수 npreg    332
glu      332
bp       332
skin     332
bmi      332
ped      332
age      332
type     332
dtype: int64

print(pim.shape)

(332, 8)

print(pim[pim["bmi"]<30].shape)
# bmi가 30보다 작은 행 갯수

(118, 8)

pim.mean(numeric_only=True) # per-column mean; numeric_only skips the string column 'type' (pandas 2.x raises without it)

npreg      3.484940
glu      119.259036
bp        71.653614
skin      29.162651
bmi       33.239759
ped        0.528389
age       31.316265
dtype: float64

import matplotlib.pyplot as plt
pim["bmi"].hist() # barplot(이산적) / histogram(부동소수점 float)
plt.show()
pim["bmi"].plot(kind="kde") # interpolation (보간법)
plt.show()

pim.head()

pim.groupby("type") # DataFrameGroupBy 내부적으로 표현

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A718F64888>

# 집계함수 sum(), mean(), median(), max(), min(), last(), first()
pim.groupby("type").mean()

pim.groupby("type").count()

# Group once and reuse the GroupBy object for several aggregations
group_by_type = pim.groupby("type")
group_by_type.mean()
group_by_type.std()

# Several aggregations in one call. Use the string names: passing np.mean / np.std
# is deprecated in recent pandas and the strings select the same groupby methods.
group_by_type.agg(['mean', 'std'])

print(np.mean(pim[pim["type"]=="Yes"]["skin"]))

32.88990825688074

print(np.std(pim[pim["type"]=="Yes"]["skin"]))

9.024268451930087

weather = pd.read_csv("we_2012.csv") 
weather

we_2012.csv

0.49MB

weather_2012_final = pd.read_csv("we_2012.csv")
weather_2012_final.head()

index 지정하면 검색이 100배 빨라짐
시간데이터에서 Datetime Index를 만드는 방법

# Setting an index makes lookups much faster (the author quotes roughly 100x).
# Two ways to build a DatetimeIndex from time data:
#   pd.date_range(...)  - generate timestamps with a fixed frequency over a period
#   pd.to_datetime(...) - convert time data that already exists
# (These are notes, not calls: the bare names would raise NameError.)

index = pd.to_datetime(weather_2012_final["Date/Time"])
weather_2012_final.index = index
weather_2012_final.head() # looks the same, but the index is now real datetimes

# The original text column is redundant once it serves as the index
del weather_2012_final["Date/Time"]
weather_2012_final.head()

weather_2012_final.shape

(8784, 7)

bigFilePath = "we_2012.csv"
# Read a large file in pieces of `chunksize` rows.
# index_col + parse_dates=True turn the "Date/Time" column into a datetime index.
chunker = pd.read_csv(bigFilePath, chunksize=1000, index_col="Date/Time",
                      parse_dates=True, encoding="UTF-8")
# Concatenate WITHOUT ignore_index so the datetime index built above is kept
weather_2012_final = pd.concat(list(chunker))

print(weather_2012_final.describe())
weather_2012_final.dtypes

weather_2012_final['Temp (C)'].plot(figsize=(30,12))
weather_2012_final.boxplot()

# Missing values per column (count() would report the NON-null counts instead)
print("결측치", weather_2012_final.isnull().sum())

print(weather_2012_final.isnull().values.sum()) # total number of nulls in the whole frame

print(weather_2012_final.isnull().any()) # per column: does it contain any null?

Temp (C)              False
Dew Point Temp (C)    False
Rel Hum (%)           False
Wind Spd (km/h)       False
Visibility (km)       False
Stn Press (kPa)       False
Weather               False
dtype: bool

weather_2012_final = weather_2012_final.dropna(axis=1, how='any') # axis=1 drops COLUMNS that contain any NaN (axis=0 would drop rows)

# Range of a column: max minus min
f = lambda x: x.max() -x.min()
print("함수 객체의 열 적용 (행방향)", weather_2012_final.apply(f)) # applied column by column; cannot be computed while a string column is included

weather_2012_final.dtypes
weather_2012_final_num = weather_2012_final.iloc[:,:6] # keep only the first six columns
print("함수 객체의 열 적용 (행방향)", weather_2012_final_num.apply(f))

함수 객체의 열 적용 (행방향) Temp (C)              56.30
Dew Point Temp (C)    52.90
Rel Hum (%)           82.00
Wind Spd (km/h)       83.00
Visibility (km)       48.10
Stn Press (kPa)        6.13
dtype: float64

# ptp (point to point) : min-max
print("함수 객체의 열 적용 (행방향)", weather_2012_final_num.apply(np.ptp))

import glob
import os
import pandas as pd
filePathList = glob.glob("./same__files/*.csv") # every .csv path under ./same__files
print(filePathList)
temp = os.path.basename(filePathList[0]) # file name including extension, directory part stripped
print(temp)

['./same__files\\1763.csv', './same__files\\1764.csv', './same__files\\1765.csv', './same__files\\1766.csv', './same__files\\1767.csv', './same__files\\1768.csv', './same__files\\1769.csv', './same__files\\1770.csv', './same__files\\1771.csv', './same__files\\1772.csv'] 1763.csv

temp = os.path.splitext(temp)[0]
print(temp)
os.path.splitext(temp)

1763
('1763', '')

# Load every CSV into a module-level variable named data_<file stem>, e.g. data_1763.
# globals() is the module namespace (what vars() returns at top level); being
# explicit keeps this working as intended only at module scope.
# NOTE(review): the printed head of data_1763 shows a data row used as column
# names, so these files seem to have no header line — consider header=None; confirm.
for csv_path in filePathList:
    temp = os.path.splitext(os.path.basename(csv_path))[0]
    globals()["data_" + temp] = pd.read_csv(csv_path)

print(data_1763.head(3))
print(data_1770.shape)

   ITE00100554  17630101  TMAX  -36  Unnamed: 4 Unnamed: 5  E  Unnamed: 7
0  ITE00100554  17630101  TMIN  -50         NaN        NaN  E         NaN
1  ITE00100554  17630102  TMAX  -26         NaN        NaN  E         NaN
2  ITE00100554  17630102  TMIN  -40         NaN        NaN  E         NaN
(729, 8)

df = pd.read_csv("./sales.csv")
df

sales.csv

0.00MB

df.dtypes

Customer Number     int32
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

1. 정수 => 부동소수점으로 인식
2. $를 제거
3. %를 제거
4. 숫자에 문자 제거
5. Y:1, N:0 으로 boolean형으로 변경

# Convert types with astype (to int here)
df['Customer Number'] = df['Customer Number'].astype("int")

# Strip the currency symbol and thousands separators, then convert to float.
# regex=False is required for "$": as a regex it is the end-of-string anchor,
# and the default of `regex` differs between pandas versions.
# The .str accessor also passes missing values through instead of raising.
for year in ('2016', '2017'):
    df[year] = (df[year]
                .str.replace("$", "", regex=False)
                .str.replace(",", "", regex=False)
                .astype('float'))
#df['Percent Growth'] = df['Percent Growth'].str.replace("%","")
def convert_percent(val):
    """Turn a percentage string such as '30%' into its fractional value (0.3).

    Every '%' character in *val* is dropped before the numeric conversion.
    """
    digits = "".join(val.split('%'))
    return float(digits) / 100
# Apply the percent converter to every element of the column
df['Percent Growth'] = df['Percent Growth'].map(convert_percent)

# Boolean column: True where the flag is 'Y', False otherwise
df['Active'] = df['Active'] == 'Y'

# errors='coerce' turns values that cannot be parsed as numbers into NaN
df['Jan Units'] = pd.to_numeric(df['Jan Units'],
                               errors='coerce') # other options: ignore, raise

df.dtypes

Customer Number      int32
Customer Name       object
2016               float64
2017               float64
Percent Growth     float64
Jan Units          float64
Month                int64
Day                  int64
Year                 int64
Active                bool
dtype: object

저작자표시 (새창열림)