How to improve an ML model's accuracy

I'm writing a Python script for sentiment analysis. I preprocessed the text, vectorized the categorical features, and split the dataset, then trained a LogisticRegression model and reached 84% accuracy.


But when I load a new dataset and try to deploy the trained model on it, I only get 51.84% accuracy.


Code:

    import pandas as pd
    import numpy as np
    import re
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
    from sklearn.model_selection import train_test_split

    # ML Libraries
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    import joblib

    stop_words = set(stopwords.words('english'))

    def load_dataset(filename, cols):
        dataset = pd.read_csv(filename, encoding='latin-1')
        dataset.columns = cols
        return dataset

    # use a raw string so the backslashes in the Windows path are not read as escapes
    dataset = load_dataset(r"F:\AIenv\sentiment_analysis\input_2_balanced.csv", ["id", "label", "date", "text"])
    dataset.head()

    # processTweet is not defined in the question; a placeholder sketch follows this block
    dataset['clean_text'] = dataset['text'].apply(processTweet)

    # create doc2vec vector columns
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset["clean_text"].apply(lambda x: x.split(" ")))]

    # train a Doc2Vec model with our text data
    model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

    # transform each document into a vector
    doc2vec_df = dataset["clean_text"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
    doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
    dataset = pd.concat([dataset, doc2vec_df], axis=1)
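
The code above calls processTweet without defining it. A minimal placeholder, assuming the usual cleaning steps (lowercasing, stripping URLs, mentions, punctuation, and stopwords) and reusing the imports already present above; the real implementation may differ:

    def processTweet(text):
        # hypothetical stand-in for the undefined processTweet used above
        text = text.lower()
        text = re.sub(r"https?://\S+", "", text)    # drop URLs
        text = re.sub(r"@\w+|#", "", text)          # drop @mentions and '#'
        text = text.translate(str.maketrans("", "", string.punctuation))
        tokens = [w for w in word_tokenize(text) if w not in stop_words]
        return " ".join(tokens)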

    


4 Answers

手掌心

Your new data may be very different from the first dataset you used to train and test your model. Preprocessing techniques and statistical analysis will help you characterize the data and compare the two datasets. Poor performance on new data can be observed for various reasons, including:

- Your initial dataset is not statistically representative of the larger data (e.g., your dataset is an extreme case).
- Overfitting: you over-trained your model, so it captured the specificities (noise) of the training data.
- Different preprocessing methods between the two datasets.
- An imbalanced training dataset. ML techniques work best with balanced datasets (equal occurrence of the different classes in the training set).
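
A quick way to act on this advice is to compare the two datasets before touching the model. A minimal sketch, assuming both CSVs have the label and clean_text columns used in the question (the file names are hypothetical):

    import pandas as pd

    train_df = pd.read_csv("train.csv")  # hypothetical paths
    new_df = pd.read_csv("new.csv")

    # 1. class balance: a skewed label distribution in either set drags accuracy down
    print(train_df["label"].value_counts(normalize=True))
    print(new_df["label"].value_counts(normalize=True))

    # 2. vocabulary overlap: if the new data is mostly unseen tokens, the model is guessing
    train_vocab = set(" ".join(train_df["clean_text"]).split())
    new_vocab = set(" ".join(new_df["clean_text"]).split())
    overlap = len(train_vocab & new_vocab) / len(new_vocab)
    print(f"share of new-data vocabulary seen in training: {overlap:.2%}")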

守着星空守着你

I ran an investigative study of how different classifiers perform in sentiment analysis. For a particular Twitter dataset, I trained models such as logistic regression, naive Bayes, support vector machine, k-nearest neighbors (KNN), and decision tree. On the chosen dataset, logistic regression and naive Bayes performed well and consistently across all tests, followed by SVM, then the decision tree; KNN had the lowest accuracy score:

    Classifier   Accuracy score   RMSE
    LR           78.3541          1.053619
    NB           76.764706        1.064738
    SVM          73.5835          1.074752
    DT           69.2941          1.145234
    KNN          62.9476          1.376589

In these cases, feature extraction is critical.
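
The exact setup behind those numbers isn't shown. A minimal sketch of such a comparison, assuming TF-IDF features and the dataset, clean_text, and label names from the question:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import LinearSVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import cross_val_score

    X = TfidfVectorizer(min_df=2).fit_transform(dataset["clean_text"])
    y = dataset["label"]

    models = {
        "LR": LogisticRegression(max_iter=1000),
        "NB": MultinomialNB(),
        "SVM": LinearSVC(),
        "DT": DecisionTreeClassifier(),
        "KNN": KNeighborsClassifier(),
    }
    # 5-fold cross-validated accuracy for each candidate classifier
    for name, clf in models.items():
        scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
        print(f"{name}: mean accuracy {scores.mean():.4f}")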

largeQ

Import the essentials, then grid-search a logistic regression over several train/test splits and draw a confusion matrix for the best model:

    import pandas as pd
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import GridSearchCV
    import time

    df = pd.read_csv('FilePath', header=0)
    X = df['content']
    y = df['sentiment']

    def lrSentimentAnalysis(n):
        # Using CountVectorizer to convert text into tokens/features
        vect = CountVectorizer(ngram_range=(1, 1))
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=n)

        # Using training data to transform text into counts of features for each message
        vect.fit(X_train)
        X_train_dtm = vect.transform(X_train)
        X_test_dtm = vect.transform(X_test)

        # dual = [True, False]
        max_iter = [100, 110, 120, 130, 140, 150]
        C = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5]
        solvers = ['newton-cg', 'lbfgs', 'liblinear']
        param_grid = dict(max_iter=max_iter, C=C, solver=solvers)

        LR1 = LogisticRegression(penalty='l2', multi_class='auto')
        grid = GridSearchCV(estimator=LR1, param_grid=param_grid, cv=10, n_jobs=-1)
        grid_result = grid.fit(X_train_dtm, y_train)

        # Summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        y_pred = grid_result.predict(X_test_dtm)
        print('Accuracy Score: ', metrics.accuracy_score(y_test, y_pred) * 100, '%')
        # print('Confusion Matrix: ', metrics.confusion_matrix(y_test, y_pred))
        # print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
        # print('MSE:', metrics.mean_squared_error(y_test, y_pred))
        print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
        return [n, metrics.accuracy_score(y_test, y_pred) * 100,
                grid_result.best_estimator_.get_params()['max_iter'],
                grid_result.best_estimator_.get_params()['C'],
                grid_result.best_estimator_.get_params()['solver']]

    def darwConfusionMetrix(accList):
        # Using CountVectorizer to convert text into tokens/features
        vect = CountVectorizer(ngram_range=(1, 1))
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=accList[0])

        # Using training data to transform text into counts of features for each message
        vect.fit(X_train)
        X_train_dtm = vect.transform(X_train)
        X_test_dtm = vect.transform(X_test)

        # Accuracy using Logistic Regression Model
        LR = LogisticRegression(penalty='l2', max_iter=accList[2], C=accList[3], solver=accList[4])
        LR.fit(X_train_dtm, y_train)
        y_pred = LR.predict(X_test_dtm)

        # creating a heatmap for the confusion matrix
        data = metrics.confusion_matrix(y_test, y_pred)
        df_cm = pd.DataFrame(data, columns=np.unique(y_test), index=np.unique(y_test))
        df_cm.index.name = 'Actual'
        df_cm.columns.name = 'Predicted'
        plt.figure(figsize=(10, 7))
        sns.set(font_scale=1.4)  # for label size
        sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})  # font size
        fig0 = plt.gcf()
        fig0.show()
        fig0.savefig('FilePath', dpi=100)

    def findModelWithBestAccuracy(accList):
        accuracyList = []
        for item in accList:
            accuracyList.append(item[1])
        N = accuracyList.index(max(accuracyList))
        print('Best Model:', accList[N])
        return accList[N]

    accList = []
    print('Logistic Regression')
    print('grid search method for hyperparameter tuning (accuracy by cross validation)')
    for i in range(2, 7):
        n = i / 10.0
        print("\nsplit ", i - 1, ": n=", n)
        accList.append(lrSentimentAnalysis(n))

    darwConfusionMetrix(findModelWithBestAccuracy(accList))

幕布斯7119047

Preprocessing is an essential part of building a well-performing classifier. When you see such a large gap between training and test-set performance, it is likely that some mistake happened in the preprocessing of your (test set) data. Classifiers are also available without any programming: you can visit a web service such as Insight Classifiers and try building one for free first.
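
In practice, the most common form of this mistake is fitting a new vectorizer on the new data instead of reusing the one fitted at training time. A minimal sketch, assuming the dataset, clean_text, and label names from the question and a hypothetical file name, that persists the fitted vectorizer and classifier together so new data goes through identical preprocessing:

    import joblib
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    # fit the vectorizer and classifier as one object on the training data
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(dataset["clean_text"], dataset["label"])
    joblib.dump(pipe, "sentiment_pipeline.joblib")  # hypothetical file name

    # at deployment time: load and predict; the stored vectorizer is reused, never re-fitted
    pipe = joblib.load("sentiment_pipeline.joblib")
    new_preds = pipe.predict(new_dataset["clean_text"])  # new_dataset: the new CSV, cleaned the same way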