  • Machine Learning with scikit-learn (sklearn) - 2 (xgboost)
    Machine Learning 2021. 3. 11. 13:23

    Please install the packages below before running the code.

     


    Run the following in the Anaconda prompt:


    conda install -c conda-forge graphviz
    conda install -c conda-forge python-graphviz
    pip install pydot
    pip install pydotplus


    %matplotlib inline
    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_val_score
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(random_state=0)
    iris = load_iris()
    clf = clf.fit(iris.data, iris.target)
    tree.plot_tree(clf)

    [Text(167.4, 199.32, 'X[3] <= 0.8\ngini = 0.667\nsamples = 150\nvalue = [50, 50, 50]'),
     Text(141.64615384615385, 163.07999999999998, 'gini = 0.0\nsamples = 50\nvalue = [50, 0, 0]'),
     Text(193.15384615384616, 163.07999999999998, 'X[3] <= 1.75\ngini = 0.5\nsamples = 100\nvalue = [0, 50, 50]'),
     Text(103.01538461538462, 126.83999999999999, 'X[2] <= 4.95\ngini = 0.168\nsamples = 54\nvalue = [0, 49, 5]'),
     Text(51.50769230769231, 90.6, 'X[3] <= 1.65\ngini = 0.041\nsamples = 48\nvalue = [0, 47, 1]'),
     Text(25.753846153846155, 54.359999999999985, 'gini = 0.0\nsamples = 47\nvalue = [0, 47, 0]'),
     Text(77.26153846153846, 54.359999999999985, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
     Text(154.52307692307693, 90.6, 'X[3] <= 1.55\ngini = 0.444\nsamples = 6\nvalue = [0, 2, 4]'),
     Text(128.76923076923077, 54.359999999999985, 'gini = 0.0\nsamples = 3\nvalue = [0, 0, 3]'),
     Text(180.27692307692308, 54.359999999999985, 'X[2] <= 5.45\ngini = 0.444\nsamples = 3\nvalue = [0, 2, 1]'),
     Text(154.52307692307693, 18.119999999999976, 'gini = 0.0\nsamples = 2\nvalue = [0, 2, 0]'),
     Text(206.03076923076924, 18.119999999999976, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
     Text(283.2923076923077, 126.83999999999999, 'X[2] <= 4.85\ngini = 0.043\nsamples = 46\nvalue = [0, 1, 45]'),
     Text(257.53846153846155, 90.6, 'X[1] <= 3.1\ngini = 0.444\nsamples = 3\nvalue = [0, 1, 2]'),
     Text(231.7846153846154, 54.359999999999985, 'gini = 0.0\nsamples = 2\nvalue = [0, 0, 2]'),
     Text(283.2923076923077, 54.359999999999985, 'gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]'),
     Text(309.04615384615386, 90.6, 'gini = 0.0\nsamples = 43\nvalue = [0, 0, 43]')]

     

    Tree splits are chosen by information gain, measured with an impurity function: the Gini coefficient or entropy.
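
    The Gini value shown at each node of the output above can be reproduced by hand. A minimal sketch (my own addition, not part of the original post):

    # Gini impurity = 1 - sum(p_k^2) over the class proportions of a node
    import numpy as np

    def gini(counts):
        p = np.asarray(counts) / np.sum(counts)
        return 1 - np.sum(p ** 2)

    print(gini([50, 50, 50]))  # 0.667 -> the root node of the iris tree above
    print(gini([0, 49, 5]))    # 0.168 -> matches one of the internal nodes above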

    # hyper parameter -> GridSearchCV

    Why build a decision tree? (the split criterion is the variable with the largest variance)

    - to reduce the number of comparisons

    Problems: overfitting, and changing the order of the variables changes the resulting tree

    • max_depth : how deep the tree is allowed to grow
    • min_samples_split : the minimum number of samples required to split a node (default 2); the terminal nodes are leaves
    • min_samples_leaf : the minimum number of samples required to form a leaf node
    • hyperparameter combinations are built and tested with GridSearchCV (see the sketch after the cross-validation output below)
    cross_val_score(clf, iris.data, iris.target, cv=10)

    array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
           0.86666667, 0.93333333, 1.        , 1.        , 1.        ])
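
    As noted in the bullet list above, hyper-parameter combinations for the tree can be tested with GridSearchCV. A minimal sketch of my own (the candidate values are arbitrary, not from the original post):

    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    param_grid = {'max_depth': [2, 3, 4, None],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4]}
    search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=10)
    search.fit(iris.data, iris.target)   # iris was loaded above
    print(search.best_params_, search.best_score_)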

    print(clf.get_n_leaves()) # number of leaves

    9

    clf.get_depth() # maximum depth of the tree

    5

    clf.get_params() # current (default) parameter values

    {'ccp_alpha': 0.0,
     'class_weight': None,
     'criterion': 'gini',
     'max_depth': None,
     'max_features': None,
     'max_leaf_nodes': None,
     'min_impurity_decrease': 0.0,
     'min_impurity_split': None,
     'min_samples_leaf': 1,
     'min_samples_split': 2,
     'min_weight_fraction_leaf': 0.0,
     'presort': 'deprecated',
     'random_state': 0,
     'splitter': 'best'}

    print(iris.data.shape) # number of rows and columns
    print(iris.feature_names)  # column names

    (150, 4)
    ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

     

    import pandas as pd
    data = pd.DataFrame(iris.data)
    print(data.head())

         0    1    2    3
    0  5.1  3.5  1.4  0.2
    1  4.9  3.0  1.4  0.2
    2  4.7  3.2  1.3  0.2
    3  4.6  3.1  1.5  0.2
    4  5.0  3.6  1.4  0.2

     

    clf.predict(data.iloc[1:150,:])

    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

     


    Pipeline design

    from sklearn.pipeline import make_pipeline
    from sklearn.naive_bayes import MultinomialNB # naive Bayes
    from sklearn.preprocessing import Binarizer # converts values to 0/1 around a threshold
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe.steps[0])
    print(pipe[0])
    
    #pipe['reduce_dim']

    ('binarizer', Binarizer())
    Binarizer()

    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC # support vector classifier
    from sklearn.decomposition import PCA # principal component analysis
    estimators = [('reduce_dim', PCA()), ('clf', SVC())] # give each step a name
    pipe = Pipeline(estimators)
    pipe

    Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())])

    print(pipe.steps[0]) # step 0 is the PCA
    print(pipe[0])

    ('reduce_dim', PCA())
    PCA()

    pipe.set_params(clf__C=10) # pass a parameter to the 'clf' step

    Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC(C=10))])
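
    The pipeline above has only been configured, not fitted. A minimal sketch of my own showing it used end to end on the iris data loaded earlier:

    from sklearn.model_selection import cross_val_score

    # the whole pipeline (PCA followed by SVC with C=10) is treated as a single estimator
    scores = cross_val_score(pipe, iris.data, iris.target, cv=5)
    print(scores.mean())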



    GridSearchCV

    import numpy as np
    def make_data(N, err=1.0,rseed=1):
        rng = np.random.RandomState(rseed)
        X = rng.rand(N,1) **2
        y = 10-1. / (X.ravel() + 0.1)
        if err > 0:
            y += err * rng.randn(N)
        return X,y
    X, y = make_data(40)
    print(type(X))

    <class 'numpy.ndarray'>

    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import PolynomialFeatures # polynomial feature expansion
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LinearRegression
    import numpy as np
    
    # returns a pipeline
    def PolynomialRegression(degree=2, **kwargs): # variable keyword arguments (dict)
        return make_pipeline(PolynomialFeatures(degree),
                            LinearRegression(**kwargs))
    
    param_grid = {'polynomialfeatures__degree': np.arange(21), # 21 values
                  'linearregression__fit_intercept':[True, False], # 2 values
                  'linearregression__normalize':[True, False]} # 2 values -> 84 combinations in total
    param_grid

    {'polynomialfeatures__degree': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20]),
     'linearregression__fit_intercept': [True, False],
     'linearregression__normalize': [True, False]}

     

    grid = GridSearchCV(PolynomialRegression(), param_grid,cv=7) # cv = number of cross-validation folds
    grid.fit(X,y)

    GridSearchCV(cv=7,
                 estimator=Pipeline(steps=[('polynomialfeatures',
                                            PolynomialFeatures()),
                                           ('linearregression',
                                            LinearRegression())]),
                 param_grid={'linearregression__fit_intercept': [True, False],
                             'linearregression__normalize': [True, False],
                             'polynomialfeatures__degree': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20])})


    Best combination

    # 84 combinations were tested in total
    grid.best_params_ # the best parameter combination

    {'linearregression__fit_intercept': False,
     'linearregression__normalize': True,
     'polynomialfeatures__degree': 4}

    grid.best_estimator_ # the pipeline refitted with the best combination

    Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=4)),
                    ('linearregression',
                     LinearRegression(fit_intercept=False, normalize=True))])

    grid.best_score_ # the best mean cross-validation score (R^2 here, not a probability)

    0.8972710305736544

    import matplotlib.pyplot as plt
    model = grid.best_estimator_
    X_test = np.linspace(-0.1,1.1,500)[:,None]
    plt.scatter(X.ravel(), y) # ravel flattens to 1-D
    lim = plt.axis()
    y_test = model.fit(X,y).predict(X_test)
    plt.plot(X_test.ravel(), y_test);
    plt.axis(lim);

    import numpy as np
    import pandas as pd
    dataset= [10,12,12,13,12,11,14,13,15,10,10,10,100,12,14,13,102,105,123,125,
              12,10, 10,11,12,15,12,13,12,11,14,13,15,10,15,12,10,14,13,15,10] 
    outliers = []
    def detect_outlier(data_1):
        threshold=3
        mean_1 = np.mean(data_1)
        std_1 = np.std(data_1)
        for y in data_1:
            z_score= (y - mean_1)/std_1 # z-score
            if np.abs(z_score) > threshold: # threshold = the boundary (cut-off) value
                outliers.append(y)
        return outliers
    outlier_datapoints = detect_outlier(dataset)
    print(outlier_datapoints)

    [123, 125]
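
    The robust_scale used in the next cell is built on the IQR, which gives an alternative outlier rule. A minimal sketch of my own using the same dataset:

    q1, q3 = np.percentile(dataset, [25, 75])    # dataset defined above
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # the usual 1.5 * IQR fences
    print([v for v in dataset if v < low or v > high])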

    # scale = z-score (subtract the mean, divide by the standard deviation)
    # robust_scale : centers on the median and divides by the IQR
    # minmax_scale -> 0 ~ 1, maxabs_scale -> -1 ~ 1
    from sklearn.preprocessing import scale, robust_scale, minmax_scale , maxabs_scale
    print((np.arange(10, dtype = float) -3))
    x = (np.arange(10, dtype = float)-3).reshape(-1,1)
    
    df = pd.DataFrame(np.hstack([x, scale(x), robust_scale(x),
                                minmax_scale(x), maxabs_scale(x)]),
                     columns = ["x", "scale(x)", "robust_scale(x)","minmax_scale(x)","maxabs_scale(x)"])
    df.plot()

    [-3. -2. -1.  0.  1.  2.  3.  4.  5.  6.]

     


    Why normalize

    Why normalize at all? To give every variable an equal contribution to the model.
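
    A minimal sketch of my own illustrating the point: with a distance-based model such as k-nearest neighbours, an unscaled feature with a large range dominates the distance, while standardizing lets every feature contribute equally.

    from sklearn.datasets import load_wine
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import cross_val_score

    Xw, yw = load_wine(return_X_y=True)   # features on very different scales
    raw = cross_val_score(KNeighborsClassifier(), Xw, yw, cv=5).mean()
    scaled = cross_val_score(make_pipeline(StandardScaler(), KNeighborsClassifier()), Xw, yw, cv=5).mean()
    print(raw, scaled)                    # accuracy typically improves a lot after scaling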

     

    # the shape of the distribution stays the same
    import seaborn as sns
    from sklearn.datasets import load_iris
    iris = load_iris()
    print(type(iris))
    data1 = iris.data
    data2 = scale(iris.data)
    print("mean before preprocessing:", np.mean(data1, axis=0))
    print("std before preprocessing:", np.std(data1, axis=0))
    print("mean after preprocessing:", np.mean(data2, axis=0))
    print("std after preprocessing:", np.std(data2, axis=0))
    sns.jointplot(x=data1[:,0], y=data1[:,1])
    plt.show()
    sns.jointplot(x=data2[:,0], y=data2[:,1])
    plt.show()

    mean before preprocessing: [5.84333333 3.05733333 3.758      1.19933333]
    std before preprocessing: [0.82530129 0.43441097 1.75940407 0.75969263]
    mean after preprocessing: [-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
    std after preprocessing: [1. 1. 1. 1.]


    Standardizing to a normal (z-score) scale

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler() # an instance has to be created first
    scaler.fit(data1)
    data2 = scaler.transform(data1) # transformer (transforms data) vs. estimator (fits a model)
    data1.std(), data2.std()

    (1.9738430577598278, 1.0)


    One-hot Encoder

    from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder()
    X = np.array([[0],[1],[2]])
    X
    ohe.fit(X)
    #print(ohe.n_values_, ohe.feature_indices_, ohe.active_features_)
    ohe.categories_

    [array([0, 1, 2])]

    print(ohe.transform(X).toarray())

    [[1. 0. 0.]
     [0. 1. 0.]
     [0. 0. 1.]]

    # 3 input columns expand to 10 one-hot columns
    X = np.array([[0,0,4],[1,1,0],[0,2,1],[1,0,2],[1,1,3]])
    ohe = OneHotEncoder()
    ohe.fit(X)
    print(ohe.transform(X).toarray()) # show as a dense array

    [[1. 0. 1. 0. 0. 0. 0. 0. 0. 1.]
     [0. 1. 0. 1. 0. 1. 0. 0. 0. 0.]
     [1. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
     [0. 1. 1. 0. 0. 0. 0. 1. 0. 0.]
     [0. 1. 0. 1. 0. 0. 0. 0. 1. 0.]]
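
    Where the 10 columns come from can be read off categories_; a short check of my own:

    print(ohe.categories_)
    # [array([0, 1]), array([0, 1, 2]), array([0, 1, 2, 3, 4])] -> 2 + 3 + 5 = 10 one-hot columns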


    Label Encoder

    # label encoder
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit([1,2,2,6])
    le.classes_

    array([1, 2, 6])

    le.transform([1,1,2,6])

    array([0, 0, 1, 2], dtype=int64)

    le.inverse_transform([0,0,1,2])

    array([1, 1, 2, 6])

    # Exercise: encode the following data.
    a = ["서울","서울","대전","부산"]
    le.fit(a)
    le.classes_
    a_a = le.transform(a)
    a_a

    array([2, 2, 0, 1], dtype=int64)

    le.inverse_transform(a_a)

    array(['서울', '서울', '대전', '부산'], dtype='<U2')

     


    Dictionary vectorizer

    # dict ~
    from sklearn.feature_extraction import DictVectorizer
    v = DictVectorizer(sparse=False)
    D = [{'foo' : 1, 'bar' : 2}, {'foo' : 3, 'baz' : 1}]
    X = v.fit_transform(D)
    X
    #      'bar' , 'baz', 'foo'
    #        2       0      1
    #        0       1      3

    array([[2., 0., 1.],
           [0., 1., 3.]])

    v.feature_names_

    ['bar', 'baz', 'foo']

    v.inverse_transform(X)

    [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]

     


    Handling missing values

    # handling missing values
    from sklearn.impute import SimpleImputer
    imp_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean') # other strategies: 'median', 'most_frequent' (mode)
    imp_mean.fit([[7,2,3],[4,np.nan,6],[10,5,9]])
    X = [[np.nan,2,3], [4,np.nan,6], [10,np.nan,9]]
    print(imp_mean.transform(X))

    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]

     

    ** tip

    You can impute with the mean like this, or train a machine-learning model to predict the missing values and use those predictions as the independent variable instead.
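
    A minimal sketch of my own of the model-based idea, using sklearn's KNNImputer, which fills each missing cell from the most similar rows:

    from sklearn.impute import KNNImputer

    imp_knn = KNNImputer(n_neighbors=2)
    X = [[np.nan,2,3], [4,np.nan,6], [10,np.nan,9]]
    print(imp_knn.fit_transform(X))   # each missing cell filled from the 2 nearest rows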

     

    ohe = OneHotEncoder()
    ohe.fit([["서울"],["서울"],["대전"],["부산"]])
    ohe.transform([["서울"],["서울"]]).toarray()

    array([[0., 0., 1.],
           [0., 0., 1.]])

    X = np.arange(6).reshape(3,2)
    X

    array([[0, 1],
           [2, 3],
           [4, 5]])

    # [1, a, b, a^2, ab, b^2] -> basis for nonlinear (polynomial) regression
    poly = PolynomialFeatures(2)
    poly.fit_transform(X)
    

    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
           [ 1.,  2.,  3.,  4.,  6.,  9.],
           [ 1.,  4.,  5., 16., 20., 25.]])


    Ensemble

    RandomForest : builds many decision-tree (DT) models
    continuous target : predicts with the average of the trees' outputs
    discrete target : decides by a vote over the trees' outputs
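
    A quick check of my own of the averaging behaviour for a continuous target (not part of the original post):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    Xr, yr = make_regression(n_samples=200, n_features=5, noise=0.5, random_state=0)
    reg = RandomForestRegressor(n_estimators=30, random_state=0).fit(Xr, yr)

    # the forest prediction equals the mean of the individual trees' predictions
    per_tree = np.stack([t.predict(Xr) for t in reg.estimators_])
    print(np.allclose(per_tree.mean(axis=0), reg.predict(Xr)))   # True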

    from sklearn.datasets import make_classification
    X,y = make_classification(1000)
    
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=30) # build 30 trees
    rf.fit(X,y)

    RandomForestClassifier(n_estimators=30)

    print("Accuracy:\t", (y == rf.predict(X)).mean())

    Accuracy: 1.0

     

    Using a Korean font

    import matplotlib.pyplot as plt
    # font setting so Korean labels render correctly
    plt.rcParams['font.family'] = 'Malgun Gothic'
    # %matplotlib inline
    # import matplotlib.pyplot as plt
    
    f, ax = plt.subplots(figsize=(7,5))
    ax.bar(range(0, len(rf.feature_importances_)),
          rf.feature_importances_)
    ax.set_title("특성중요도")  # "feature importance" - kept in Korean to demonstrate the font

    print("특성수", rf.n_features_)
    print("모델", rf.estimators_)

    number of features 20
    models [DecisionTreeClassifier(max_features='auto', random_state=1395913497), DecisionTreeClassifier(max_features='auto', random_state=119341817), ... (30 fitted trees in total, random states vary)]

    # Exercise: load the data with load_boston() and print the variable importances with rf.
    from sklearn.datasets import load_boston # regression (or classification) dataset
    from sklearn.ensemble import RandomForestRegressor
    # RandomForestRegressor because the target is continuous
    
    a = load_boston()
    X = a.data
    y = a.target
    names = a["feature_names"]
    rf = RandomForestRegressor() # 
    rf.fit(X,y)

    RandomForestRegressor()

     

    print(sorted(zip(map(lambda x: round(x,2),
                        rf.feature_importances_), names),reverse = True))

    [(0.49, 'RM'), (0.32, 'LSTAT'), (0.07, 'DIS'), (0.04, 'CRIM'), (0.03, 'NOX'), (0.02, 'PTRATIO'), (0.01, 'TAX'), (0.01, 'INDUS'), (0.01, 'B'), (0.01, 'AGE'), (0.0, 'ZN'), (0.0, 'RAD'), (0.0, 'CHAS')]

    import matplotlib.pyplot as plt
    f, ax = plt.subplots(figsize = (7,5))
    ax.bar(range(0,len(rf.feature_importances_)),
          rf.feature_importances_)
    ax.set_title("feature importance")

    Evaluating continuous (regression) predictions

    Standard MSE evaluation

     

    from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
    mean_squared_error(y, rf.predict(X))

    1.6167176877470346

     

    MAE evaluation

    mean_absolute_error(y, rf.predict(X))

     

    R2 score evaluation

    r2_score(y, rf.predict(X))
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split # 0.75 : 0.25
    cancer = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=42)
    
    forest = RandomForestClassifier(n_estimators = 100, random_state=42)
    forest.fit(X_train, y_train)

    RandomForestClassifier(random_state=42)


    predict

    #  predict / score : always evaluate the model!!!!
    print('Training set accuracy {:.3f}'.format(forest.score(X_train,y_train)))
    print('Test set accuracy {:.3f}'.format(forest.score(X_test,y_test)))

    Training set accuracy 1.000
    Test set accuracy 0.965

    from sklearn.tree import export_graphviz
    export_graphviz(forest.estimators_[0], out_file="tree.dot",
                   class_names=["malignant","benign"],
                   feature_names=cancer.feature_names,impurity=False, filled=True)
    from IPython.display import display
    import graphviz
    # open the dot file for reading
    with open("tree.dot", "rt", encoding='UTF-8') as f:
        dot_graph = f.read()
    display(graphviz.Source(dot_graph))


    Prediction with XGBoost

    import pandas as pd
    from sklearn.datasets import load_boston
    boston = load_boston() # data (independent variables), target (dependent variable)
    data = pd.DataFrame(boston.data)
    data.columns = boston.feature_names # column names
    
    print(data.head())
    data['PRICE'] = boston.target # add the target as a PRICE column
    print(data.info())
    data.describe()

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 14 columns):
     #   Column   Non-Null Count  Dtype  
    ---  ------   --------------  -----  
     0   CRIM     506 non-null    float64
     1   ZN       506 non-null    float64
     2   INDUS    506 non-null    float64
     3   CHAS     506 non-null    float64
     4   NOX      506 non-null    float64
     5   RM       506 non-null    float64
     6   AGE      506 non-null    float64
     7   DIS      506 non-null    float64
     8   RAD      506 non-null    float64
     9   TAX      506 non-null    float64
     10  PTRATIO  506 non-null    float64
     11  B        506 non-null    float64
     12  LSTAT    506 non-null    float64
     13  PRICE    506 non-null    float64
    dtypes: float64(14)
    memory usage: 55.5 KB
    None

    import xgboost as xgb # the model
    from sklearn.metrics import mean_squared_error # the evaluation metric
    X,y = data.iloc[:,:-1],data.iloc[:,-1]
    data_dmatrix = xgb.DMatrix(data=X, label=y) # XGBoost's dedicated matrix format; data = independent variables, label = dependent variable
    # a DataFrame is an ndarray plus a dict (preserves order, allows duplicates)
    from sklearn.model_selection import train_test_split
    import numpy as np
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=123)
    xg_reg = xgb.XGBRegressor(objective = 'reg:linear', # linear-regression objective
                             colsample_bytree = 0.3,
                              # learning_rate : step size of the gradient-descent-style updates
                              # used to move toward the minimum of the loss surface
                             learning_rate = 0.1, max_depth= 5, alpha = 10,
                              # max_depth guards against overfitting
                             n_estimators = 10)
                                # build 10 trees
    xg_reg.fit(X_train,y_train)
    preds = xg_reg.predict(X_test) # predicted values (y-hat)
    rmse = np.sqrt(mean_squared_error(y_test,preds))
    print("RMSE: %f" % (rmse))
    import matplotlib.pyplot as plt
    xgb.plot_tree(xg_reg,num_trees=0)
    plt.rcParams['figure.figsize']=[500,200]
    plt.show()
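
    The data_dmatrix built above is not used afterwards; a minimal sketch of my own showing one way to use it, with xgb.cv (the parameters mirror the regressor above, with 'reg:squarederror' as the newer name of the linear-regression objective):

    params = {'objective': 'reg:squarederror', 'colsample_bytree': 0.3,
              'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}
    cv_results = xgb.cv(params=params, dtrain=data_dmatrix, num_boost_round=50,
                        nfold=3, metrics='rmse', as_pandas=True, seed=123)
    print(cv_results['test-rmse-mean'].tail(1))   # cross-validated RMSE after the last round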

    from numpy import loadtxt
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score
    dataset = loadtxt('pima.data', delimiter=",")
    X = dataset[:,0:8]
    Y = dataset[:,8]
    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(X,Y,
                                                       test_size = test_size, random_state = seed)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    print(model)
    from xgboost import plot_importance
    from matplotlib import pyplot
    plot_importance(model)
    plt.rcParams['figure.figsize']=[50,20]
    plt.show()

    y_pred = model.predict(X_test)
    print(y_pred)

    [0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1.
     0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1.
     0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1.
     0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
     1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1.
     0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.
     0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.
     0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0.
     1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.
     0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0.
     0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1.]

    accuracy = accuracy_score(y_test,y_pred)
    print("정확도 : %.2f%%" % (accuracy * 100.0))

    Accuracy : 74.02%

    from sklearn.feature_selection import SelectFromModel
    thresholds = np.sort(model.feature_importances_) # ascending order
    print(thresholds)

    [0.08799455 0.08907107 0.09801765 0.09824965 0.09959184 0.13577047
     0.15170811 0.23959671]

    # feature selection by importance threshold
    for thresh in thresholds:
        selection = SelectFromModel(model, threshold = thresh, prefit =True)
        select_X_train = selection.transform(X_train)
        # drops the features whose importance is below the threshold
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train)
        select_X_test = selection.transform(X_test) # transform the test data the same way
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Thresh =%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))
    import pickle
    pickle.dump(model, open("pima.pickle.dat", "wb")) # save the model
    loaded_model = pickle.load(open("pima.pickle.dat", "rb")) # load the model
    y_pred = loaded_model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test,predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    Summary

    1. Pipeline # preprocess up front and pass the result down the pipe to the next stage; split the data before preprocessing!! parameters are addressed as pipeline.(step_name__parameter)
    2. GridSearchCV # used to tune parameter combinations without grinding through them by hand; builds the combinations and tests them for you
    3. Decision Tree
      • Regression
      • Classifier
    3-1. Random Forest
    3-2. XGBoost
      • DMatrix : XGBoost's dedicated matrix format
    4. Visualization
    5. Feature importance
    6. Preprocessing : z-score (subtract the mean, divide by the standard deviation), minmax, robust (divides by the IQR)

    Evaluation

    1. Classification : confusion_matrix (can be visualized) / classification_report (per-class evaluation report)
    2. Regression : MSE (mean of the squared errors) and RMSE (its square root); for both, the smaller the value, the better the model
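
    Neither is shown above, so here is a minimal sketch of my own, reusing the breast-cancer forest fitted earlier (assuming that cell has been run; X_test and y_test were later overwritten by the pima split):

    from sklearn.metrics import confusion_matrix, classification_report

    y_pred = forest.predict(X_test)            # forest / X_test / y_test from the breast-cancer cell above
    print(confusion_matrix(y_test, y_pred))    # rows: true class, columns: predicted class
    print(classification_report(y_test, y_pred, target_names=cancer.target_names))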