machine-learning - 一类 SVM 算法耗时过长

machine-learning - 一类 SVM 算法耗时过长

下面的数据显示了我的数据集的一部分,用于检测异常

describe_file data_numbers index

0 gkivdotqvj 7309.0 0

1 hpwgzodlky 2731.0 1

2 dgaecubawx 0.0 2

3 NaN 0.0 3

4 lnpeyxsrrc 0.0 4

我使用一类 SVM 算法来检测异常

from pyod.models.ocsvm import OCSVM

random_state = np.random.RandomState(42)
outliers_fraction = 0.05

# Detectors to run, keyed by the display name used in the printed summary.
classifiers = {
    'One Classify SVM (SVM)': OCSVM(
        kernel='rbf',
        degree=3,
        gamma='auto',
        coef0=0.0,
        tol=0.001,
        nu=0.5,
        shrinking=True,
        cache_size=200,
        verbose=False,
        max_iter=-1,
        contamination=outliers_fraction,
    ),
}

# The single feature, reshaped into an (n_samples, 1) column.
X = data['data_numbers'].values.reshape(-1, 1)

for clf_name, clf in classifiers.items():
    clf.fit(X)

    # Raw anomaly score, negated.
    scores_pred = clf.decision_function(X) * -1

    # Predicted label per row; the counts below treat 1 as outlier, 0 as inlier.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    # Take an explicit copy: adding columns to a bare slice of `data` raises
    # SettingWithCopyWarning and may or may not write through to `data`.
    dfx = data[['index', 'data_numbers']].copy()
    dfx['outlier'] = y_pred.tolist()
    IX1 = np.array(dfx['data_numbers'][dfx['outlier'] == 0]).reshape(-1, 1)
    OX1 = dfx['data_numbers'][dfx['outlier'] == 1].values.reshape(-1, 1)
    print('OUTLIERS : ',n_outliers,'INLIERS : ',n_inliers, clf_name)

    # Score cut-off at the `outliers_fraction` percentile of the negated scores.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

# NOTE(review): the second argument of scoreatpercentile is a percentile in
# [0, 100], but a *score* threshold is passed here -- confirm this is intended.
tOut = stats.scoreatpercentile(dfx[dfx['outlier'] == 1]['data_numbers'], np.abs(threshold))
y = dfx['outlier'].values.reshape(-1, 1)

def severity_validation():
    """Add a ``test_severity`` column to the module-level ``dfx`` frame.

    Rows whose ``outlier`` flag is not 1 get the string ``"None"``. Outlier
    rows are tiered by comparing ``data_numbers`` with the module-level
    ``tOut`` raised by 10%, 23% and 45%; the first matching tier wins and
    anything above the last one is ``"Ultra High Severity"``.

    The tiers are computed with one vectorized ``np.select`` instead of an
    ``iterrows`` loop with chained ``dfx[col][i] = ...`` assignment: the
    chained form is not guaranteed to write into ``dfx`` (and never does
    under pandas Copy-on-Write), and the row loop is very slow on large
    frames.
    """
    tOUT10 = tOut + (tOut * 0.10)
    tOUT23 = tOut + (tOut * 0.23)
    tOUT45 = tOut + (tOut * 0.45)

    values = dfx['data_numbers'].to_numpy()
    is_outlier = (dfx['outlier'] == 1).to_numpy()

    # np.select takes the first condition that holds, which reproduces the
    # original if/elif/else order; the bare `is_outlier` entry is the `else`
    # branch (it also catches NaN values, for which every `<=` is False).
    conditions = [
        is_outlier & (values <= tOUT10),
        is_outlier & (values <= tOUT23),
        is_outlier & (values <= tOUT45),
        is_outlier,
    ]
    labels = [
        "Low Severity",
        "Medium Severity",
        "High Severity",
        "Ultra High Severity",
    ]
    dfx['test_severity'] = np.select(conditions, labels, default="None")

# Tag the detected outliers with a severity tier.
severity_validation()

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows, stratified on the outlier flag.
# NOTE(review): the row 'index' is used as a feature alongside 'data_numbers'
# -- confirm that is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    dfx[['index', 'data_numbers']],
    dfx.outlier,
    test_size=0.25,
    stratify=dfx.outlier,
    random_state=30,
)

# Instantiate classifier
normer = preprocessing.Normalizer()
# NOTE(review): probability=True makes SVC run an extra internal
# cross-validation on every fit; it is only needed for predict_proba below.
svm1 = svm.SVC(probability=True, class_weight={1: 10})
cached = mkdtemp()
# `location` replaces the deprecated `cachedir` keyword of joblib.Memory.
memory = Memory(location=cached, verbose=3)
pipe_1 = Pipeline(steps=[('normalization', normer), ('svm', svm1)], memory=memory)
cv = skl.model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = [
    {"svm__kernel": ["linear"], "svm__C": [0.5]},
    {"svm__kernel": ["rbf"], "svm__C": [0.5], "svm__gamma": [5]},
]
grd = GridSearchCV(pipe_1, param_grid, scoring='roc_auc', cv=cv)

# Training: fit the grid search ONCE and reuse the fitted object for both the
# hard labels and the probabilities. Fitting a second time (as before) doubles
# the training cost and ran after the cache directory had been deleted.
try:
    grd.fit(X_train, Y_train)
    y_pred = grd.predict(X_test)
    Y_pred = grd.predict_proba(X_test)[:, 1]
finally:
    # Always remove the temporary transformer cache, even if fitting fails.
    rmtree(cached)

# Evaluation
confmatrix = skl.metrics.confusion_matrix(Y_test, y_pred)
print(confmatrix)

def plot_roc(y_test, y_pred):
    """Plot the ROC curve of ``y_pred`` scores against ``y_test`` labels.

    The curve is computed with label 1 as the positive class, its area is
    shown in the legend, and a dashed diagonal is drawn for reference. The
    figure is displayed with ``plt.show()``; nothing is returned.
    """
    false_pos, true_pos, _ = skl.metrics.roc_curve(y_test, y_pred, pos_label=1)
    area = skl.metrics.auc(false_pos, true_pos)
    curve_label = 'ROC curve (area ={0:.2f})'.format(area)
    line_width = 2

    plt.figure()
    plt.plot(false_pos, true_pos, color='darkorange', lw=line_width, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

# Draw the ROC curve for the held-out split; Y_pred holds the second
# predict_proba column for X_test.
plot_roc(Y_test, Y_pred)

我的数据集很大,有数百万行,因此目前我只能在其中几十万行上运行。代码本身可以正常工作,但耗时太长,所以希望得到一些优化建议,让它运行得更快。

相关推荐

洞螈:独特物种的10个惊人事实与秘密 🦎

洞螈:独特物种的10个惊人事实与秘密 🦎

365bet平台开户 06-29
阅读更多
新兵连结束之后,下一步会去哪里?看完你就知道了

新兵连结束之后,下一步会去哪里?看完你就知道了

日博365网 07-10
阅读更多
国际足协世界杯冠军球员列表

国际足协世界杯冠军球员列表

日博365网 06-27
阅读更多