I am classifying emotions using a Twitter emotion dataset. To do this I wrote the code below, but when I train the model the loss becomes NaN, and I can't work out why. Although I managed to find a fix for the problem, I don't understand why it happened in the first place.

Code:
import pandas as pd
import numpy as np
import re
cols = ["id","text","emotion","intensity"]
anger_df_train = pd.read_csv("D:/Dataset/twitter_emotion/train/anger.csv",delimiter='\t',names=cols)
fear_df_train = pd.read_csv("D:/Dataset/twitter_emotion/train/fear.csv",delimiter='\t',names=cols)
joy_df_train = pd.read_csv("D:/Dataset/twitter_emotion/train/joy.csv",delimiter='\t',names=cols)
sadness_df_train = pd.read_csv("D:/Dataset/twitter_emotion/train/sadness.csv",delimiter='\t',names=cols)
# ignore_index=True gives the combined frame a clean 0..n-1 index instead of four overlapping ones
df_train = pd.concat([anger_df_train, fear_df_train, joy_df_train, sadness_df_train], ignore_index=True)
import spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("The big grey dog ate all of the chocolate, but fortunately he wasn't sick!")  # quick check that the model loaded; doc is not used below
def spacy_tokenizer(sentence):
    # strip e-mail addresses, URLs and @mentions before tokenizing
    emails = r'[A-Za-z0-9.]+@[a-zA-Z]+\.[a-zA-Z]+'
    websites = r'https?://\S+'
    mentions = r'@[A-Za-z0-9_]+'
    sentence = re.sub(emails, '', sentence)
    sentence = re.sub(websites, '', sentence)
    sentence = re.sub(mentions, '', sentence)
    # keep lemmas of content tokens: drop stop words, whitespace,
    # number-like tokens and single characters
    sentence_list = [word.lemma_ for word in nlp(sentence)
                     if not (word.is_stop or word.is_space or word.like_num or len(word) == 1)]
    return ' '.join(sentence_list)
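
# A quick sanity check of the cleaner on a made-up tweet (hypothetical input;
# the exact lemmas returned depend on the spaCy model version):
print(spacy_tokenizer("@someone loving https://example.com so much today!!"))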
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
df_train['new_text']=df_train['text'].apply(spacy_tokenizer)
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df_train['new_text'].values)
sequences = tokenizer.texts_to_sequences(df_train['new_text'].values)
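
# pad_sequences is imported above but never called in the snippet as posted;
# presumably the sequences are padded to a fixed length before training.
# A minimal sketch (maxlen=50 is an assumed value, not from the question):
data = pad_sequences(sequences, maxlen=50, padding='post')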
word_index = tokenizer.word_index
# embedding matrix: row i holds the 300-d spaCy vector of the word with
# index i; row 0 stays all-zero for the padding index
text_embedding = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    text_embedding[i] = nlp(word).vector
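
# A sketch of how text_embedding would typically back a Keras Embedding layer
# (the layer configuration below is an assumption; it is not in the posted code):
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=300,
    embeddings_initializer=tf.keras.initializers.Constant(text_embedding),
    trainable=False)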
labels = df_train['emotion'].unique()
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
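
# Note that Keras Tokenizer indices are 1-based, so the four emotions map to
# ids 1..4 rather than 0..3:
label_sequences = np.array(label_tokenizer.texts_to_sequences(df_train['emotion'].values))
# If 1-based ids reach sparse_categorical_crossentropy with a 4-unit softmax,
# id 4 indexes past the last class, which is one common way to end up with a
# NaN loss. Shifting to 0..3 (an assumption about the unshown training code)
# avoids that:
label_sequences = label_sequences - 1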