比较 n 元语法并将重复项分组

# OP here -- the solution appears to be the following:
"""Group book titles that share at least one word 3-gram.

Reads ``UnifiedBookList.txt`` (one title per line), normalises every title
(punctuation replaced by spaces, lower-cased, stop words removed), builds the
word 3-grams of what is left and prints every group of titles that share a
3-gram with the first title of the group.
"""
import re
from typing import FrozenSet, Iterable, List, Optional, Tuple

BOOK_LIST_PATH = 'UnifiedBookList.txt'
NGRAM_SIZE = 3
# A title needs at least this many words (before stop-word removal) to be
# considered at all.
MIN_WORDS = 3

# Stop words are stored lower-cased because titles are lower-cased before
# filtering; a mixed-case list would contain entries that can never match.
STOP_WORDS = frozenset([
    'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'com',
    'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 'the',
    'this', 'to', 'was',
])

_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# (original title, set of space-joined word n-grams of the normalised title)
Entry = Tuple[str, FrozenSet[str]]


def normalize_title(line: str) -> Optional[List[str]]:
    """Return the lower-cased content words of *line*.

    Punctuation is replaced by spaces and stop words are dropped.  Returns
    ``None`` when the title has fewer than ``MIN_WORDS`` words before
    stop-word removal, meaning the title should be skipped entirely.
    """
    words = _PUNCTUATION_RE.sub(' ', line).lower().split()
    if len(words) < MIN_WORDS:
        return None
    return [word for word in words if word not in STOP_WORDS]


def word_ngrams(tokens: List[str], n: int = NGRAM_SIZE) -> FrozenSet[str]:
    """Return the set of space-joined word *n*-grams of *tokens*."""
    # Imported lazily so the pure helpers in this module stay usable (and
    # testable) on machines where NLTK is not installed.
    from nltk.util import ngrams

    return frozenset(' '.join(gram) for gram in ngrams(tokens, n))


def load_entries(path: str = BOOK_LIST_PATH) -> List[Entry]:
    """Read *path* and return one ``(title, ngrams)`` entry per usable line.

    The original title is stored together with its n-grams, so a skipped
    line can never shift titles and n-grams out of step.
    """
    entries = []
    with open(path) as fin:
        for line in fin:
            # Drop the line terminator so printing does not double-space.
            title = line.rstrip('\n')
            tokens = normalize_title(title)
            if tokens is None:
                continue
            entries.append((title, word_ngrams(tokens)))
    return entries


def group_duplicates(entries: Iterable[Entry]) -> List[List[str]]:
    """Group titles that share an n-gram with the first title of a group.

    Entries are consumed in order: the first remaining entry becomes the
    head of a candidate group and every later entry that shares at least one
    n-gram with it (or has the very same title) joins the group and is not
    considered again.  Matching is against the head only, not transitive.
    Only groups with at least two titles are returned.
    """
    remaining = list(entries)
    groups = []
    while remaining:
        (title, grams), rest = remaining[0], remaining[1:]
        matches = []
        remaining = []
        # Partition into two fresh lists instead of removing from the list
        # being iterated, which would silently skip the element that follows
        # every removed match.
        for other in rest:
            other_title, other_grams = other
            # Identical titles count as duplicates even when they are too
            # short (after stop-word removal) to yield any n-gram.
            if grams & other_grams or title == other_title:
                matches.append(other_title)
            else:
                remaining.append(other)
        if matches:
            groups.append([title] + matches)
    return groups


def main() -> None:
    """Print every group of near-duplicate titles, one title per line."""
    for group in group_duplicates(load_entries()):
        for title in group:
            print(title)


if __name__ == '__main__':
    main()

比较 n 元语法并将重复项分组

1回答