| |
| """ |
| Created on Fri Apr 26 16:31:20 2019 |
| |
| @author: ELİF NUR |
| """ |
| import pandas as pd |
| import numpy as np |
| from sklearn.model_selection import train_test_split |
| from sklearn.preprocessing import LabelEncoder |
| from sklearn import preprocessing |
|
|
| def loadData(fromPath,LabelColumnName,labelCount): |
| |
| data_=pd.read_csv(fromPath) |
| if labelCount==2: |
| dataset=data_ |
| dataset[LabelColumnName]=dataset[LabelColumnName].apply({'DoS':'Anormal','BENIGN':'Normal' ,'DDoS':'Anormal', 'PortScan':'Anormal'}.get) |
| else: |
| dataset=data_ |
| data=dataset[LabelColumnName].value_counts() |
| data.plot(kind='pie') |
| featureList= dataset.drop([LabelColumnName],axis=1).columns |
| return dataset,featureList |
|
|
| def datasetSplit(df,LabelColumnName): |
| labelencoder = LabelEncoder() |
| df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1]) |
| X = df.drop([LabelColumnName],axis=1) |
| X = np.array(X) |
| X = X.T |
| for column in X: |
| median = np.nanmedian(column) |
| column[np.isnan(column)] = median |
| column[column == np.inf] = 0 |
| column[column == -np.inf] = 0 |
| X = X.T |
| scaler = preprocessing.MinMaxScaler() |
| X= scaler.fit_transform(X) |
| y=df[[LabelColumnName]] |
| return X,y |
|
|
| def train_test_dataset(df): |
| labelencoder = LabelEncoder() |
| df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1]) |
| X = df.drop([LabelColumnName],axis=1) |
| y=df[[LabelColumnName]] |
| X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.7, test_size = 0.3, random_state = 0, stratify = y) |
| X_train = np.array(X_train) |
| X_train = X_train.T |
| for column in X_train: |
| median = np.nanmedian(column) |
| column[np.isnan(column)] = median |
| column[column == np.inf] = 0 |
| column[column == -np.inf] = 0 |
| X_train = X_train.T |
| y_train = np.array(y_train) |
| y_train = y_train.T |
| for column in y_train: |
| median = np.nanmedian(column) |
| column[np.isnan(column)] = median |
| column[column == np.inf] = 0 |
| column[column == -np.inf] = 0 |
| y_train = y_train.T |
| X_test = np.array(X_test) |
| X_test = X_test.T |
| for column in X_test: |
| median = np.nanmedian(column) |
| column[np.isnan(column)] = median |
| column[column == np.inf] = 0 |
| column[column == -np.inf] = 0 |
| X_test = X_test.T |
| y_test = np.array(y_test) |
| y_test = y_test.T |
| for column in y_test: |
| median = np.nanmedian(column) |
| column[np.isnan(column)] = median |
| column[column == np.inf] = 0 |
| column[column == -np.inf] = 0 |
| y_test = y_test.T |
| |
| |
| return X_train, X_test, y_train, y_test |
|
|
|
|