宝慕林4294392
# OP here -- the solution appears to be the following.
#
# Reads book titles (one per line) from UnifiedBookList.txt and prints groups
# of titles that look like the same book: two titles are considered similar
# when, after normalization and stop-word removal, they share at least one
# run of three consecutive words (a trigram).
import re

try:
    from nltk.util import ngrams
except ImportError:
    def ngrams(sequence, n):
        """Yield every run of *n* consecutive items of *sequence* as a tuple.

        Stand-in for ``nltk.util.ngrams`` (no padding) so the script still
        runs when nltk is not installed.
        """
        items = list(sequence)
        return zip(*(items[i:] for i in range(n)))


BOOK_LIST_PATH = 'UnifiedBookList.txt'
NGRAM_SIZE = 3

# Titles are lower-cased before filtering, so only lower-case entries are
# needed here (the original list's 'I' could never match the lowered 'i').
STOP_WORDS = frozenset([
    'i', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for',
    'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 'the', 'this',
    'to', 'was', 'and',
])


def load_books(lines, stop_words=STOP_WORDS, n=NGRAM_SIZE):
    """Turn raw title lines into ``(title, keys)`` pairs.

    Args:
        lines: iterable of title strings; a trailing newline is stripped.
        stop_words: lower-case words dropped before building n-grams.
        n: n-gram size.

    Returns:
        A list of ``(title, keys)`` tuples, one per line that has more than
        two words.  ``keys`` is a frozenset holding the title's word n-grams
        (as tuples) plus the title string itself, so exact duplicates still
        match each other even when they are too short to yield an n-gram.
    """
    books = []
    for raw in lines:
        title = raw.rstrip('\n')
        # Punctuation becomes whitespace; split() then collapses any runs.
        words = re.sub(r'[^\w\s]', ' ', title).lower().split()
        if len(words) <= 2:
            continue  # blank or too short to compare meaningfully
        tokens = [word for word in words if word not in stop_words]
        # The title travels with its own n-grams, so skipped lines can no
        # longer shift titles out of step with their n-gram lists.
        books.append((title, frozenset(ngrams(tokens, n)) | {title}))
    return books


def group_similar(books):
    """Group titles that share at least one key with an earlier title.

    Args:
        books: list of ``(title, keys)`` pairs as produced by ``load_books``.

    Returns:
        A list of groups; each group is a list of titles whose first entry is
        the earliest remaining title and whose other entries are all later
        titles sharing a key with it.  Titles without any match are omitted.
    """
    groups = []
    pending = list(books)
    while pending:
        head_title, head_keys = pending[0]
        matches = []
        unmatched = []
        # Partition into fresh lists instead of removing from the list being
        # iterated, which silently skipped the element after every match.
        for book in pending[1:]:
            if head_keys & book[1]:
                matches.append(book)
            else:
                unmatched.append(book)
        if matches:
            groups.append([head_title] + [title for title, _ in matches])
        pending = unmatched
    return groups


def main(path=BOOK_LIST_PATH):
    """Print every group of similar titles found in the file at *path*."""
    with open(path) as fin:
        books = load_books(fin)
    for group in group_similar(books):
        for title in group:
            print(title)


if __name__ == "__main__":
    main()