Warning: tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words

I am building a chatbot in Python. Here is the code:


import nltk
import numpy as np
import random
import string

# Requires the NLTK data packages 'punkt' and 'wordnet'
# (nltk.download('punkt'); nltk.download('wordnet'))
f = open('/home/hostbooks/ML/stewy/speech/chatbot.txt', 'r', errors='ignore')
raw = f.read()
raw = raw.lower()  # convert to lowercase

sent_tokens = nltk.sent_tokenize(raw)  # convert to a list of sentences
word_tokens = nltk.word_tokenize(raw)  # convert to a list of words

lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey", "hii")
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def response(user_response):
    robo_response = ''
    sent_tokens.append(user_response)

    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]  # index of the most similar sentence
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]         # its similarity score

    if req_tfidf == 0:
        robo_response = robo_response + "I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response + sent_tokens[idx]
        return robo_response
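
For context, a minimal interactive loop (a sketch, not part of the original post, assuming the usual pattern for this kind of bot) that shows how greeting() and response() are typically driven; each call to response() refits the TfidfVectorizer, which is the point where the warning in the title appears:

flag = True
print("ROBO: Hi! I will answer your queries. If you want to exit, type bye.")
while flag:
    user_response = input().lower()
    if user_response == 'bye':
        flag = False
        print("ROBO: Bye! Take care.")
    elif greeting(user_response) is not None:
        print("ROBO: " + greeting(user_response))
    else:
        print("ROBO: " + response(user_response))
        sent_tokens.remove(user_response)  # undo the append done inside response()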


1 Answer

慕码人2483693

The reason is that you have used a custom tokenizer together with the default stop_words='english', so when the features are extracted a check is made for any inconsistency between stop_words and tokenizer.

If you dig into the code, you will find that this snippet in sklearn/feature_extraction/text.py is performing the consistency check:

def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
    """Check if stop words are consistent

    Returns
    -------
    is_consistent : True if stop words are consistent with the preprocessor
                    and tokenizer, False if they are not, None if the check
                    was previously performed, "error" if it could not be
                    performed (e.g. because of the use of a custom
                    preprocessor / tokenizer)
    """
    if id(self.stop_words) == getattr(self, '_stop_words_id', None):
        # Stop words are were previously validated
        return None

    # NB: stop_words is validated, unlike self.stop_words
    try:
        inconsistent = set()
        for w in stop_words or ():
            tokens = list(tokenize(preprocess(w)))
            for token in tokens:
                if token not in stop_words:
                    inconsistent.add(token)
        self._stop_words_id = id(self.stop_words)

        if inconsistent:
            warnings.warn('Your stop_words may be inconsistent with '
                          'your preprocessing. Tokenizing the stop '
                          'words generated tokens %r not in '
                          'stop_words.' % sorted(inconsistent))

As you can see, it raises this warning whenever an inconsistency is found.
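
If you want to get rid of the warning rather than just understand it, one option (a sketch based on an assumption, not stated in the answer above) is to feed the vectorizer a stop word list that has already been run through the same LemNormalize tokenizer, so that lemmatized forms such as 'ha' and 'wa' are themselves stop words:

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Sketch of a possible fix (assumption): lemmatize sklearn's built-in English
# stop word list with the same LemNormalize used as the tokenizer, so the
# stop_words list and the tokenizer agree and the consistency check passes.
lemmatized_stop_words = set()
for w in ENGLISH_STOP_WORDS:
    lemmatized_stop_words.update(LemNormalize(w))  # e.g. 'was' -> 'wa', 'has' -> 'ha'

TfidfVec = TfidfVectorizer(tokenizer=LemNormalize,
                           stop_words=list(lemmatized_stop_words))

Alternatively, the message is only a UserWarning, so the original code keeps working even if you leave it as is.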