Python:显示字典单词的匹配键

我想在我的项目中显示字典单词的匹配键。我的代码当前输出的键,但对于您键入的任何单词,都使用相同的键。例如,如果我把返回的密钥将是.如果我把同样的钥匙将被归还。请参阅下面的代码,如果做错了什么,请告诉我'england played well'[737, 736, 735, 734, 733, 732, 731, 730, 729, 728]'Hello'


import re

import os

import math

import heapq


def readfile(path, docid):

    files = sorted(os.listdir(path))

    f = open(os.path.join(path, files[docid]), 'r',encoding='latin-1')

    s = f.read()

    f.close()

    return s


DELIM = '[ \n\t0123456789;:.,/\(\)\"\'-]+'


def tokenize(text):

    return re.split(DELIM, text.lower())


N = len(sorted(os.listdir('docs')))


def indextextfiles_RR(path):

    postings={}

    docLength = {}

    term_in_document = {}

    for docID in range(N):

        s = readfile(path, docID)

        words = tokenize(s)

        length = 0

        for w in words:

            if w!='':

                length += (math.log10(words.count(w)))**2

        docLength[docID] = math.sqrt(length)

        for w in words:

            if w!='':

                doc_length = math.log10(words.count(w))/docLength[docID]

                term_in_document.setdefault(doc_length, set()).add(docID)

                postings[w] = term_in_document

    return postings



def query_RR(postings, qtext):

    words = tokenize(qtext)

    doc_scores = {}

    for docID in range(N):

        score = 0

        for w in words:

            tf = words.count(w)

            df = len(postings[w])

            idf = math.log10(N / (df+1))

            query_weights = tf * idf

        for w in words:

            if w in postings:

                score = score + query_weights

        doc_scores[docID] = score

    res = heapq.nlargest(10, doc_scores)

    return res


postings = indextextfiles_RR('docs')

print(query_RR(postings, 'hello'))

当我运行帖子时,它应该返回hello和与之关联的键列表。


芜湖不芜
浏览 100回答 1
1回答

拉丁的传说

最有可能的是,您的错误来自您对每个文件中的所有单词使用相同的字典。term_in_document几条评论len(sorted(...))它浪费资源对不需要排序的东西(排序并不便宜),因为你只得到长度。按数字读取文件根本没有意义,要做到这一点,您最终会调用文件系统多个时间来读取整个目录的文件名,因为您每次读取一个目录时都会列出文件。文件应该在处理为我们关闭文件的语句中打开。with变量和函数应使用,而类应使用 。this_notationThisNotation您在单词列表上迭代两次只是为了获得十进制对数。之后的逻辑非常令人困惑,您似乎正在对每个单词出现次数的十进制对数进行RMS(均方根),但您不会将其除以单词数。之后,你又得到了对数。你应该更好地定义你的问题。当我获得新信息时,我将编辑我的答案。import reimport osimport mathimport heapqdef read_file(path):    with open(path, 'r', encoding='latin-1') as f:        return f.read()DELIM = '[ \n\t0123456789;:.,/\(\)\"\'-]+'def tokenize(text):    return re.split(DELIM, text.lower())def index_text_files_rr(path):    postings = {}    doc_lengths = {}    term_in_document = {}    files = sorted(os.listdir(path))    for i, file in enumerate(files):        file_path = os.path.join(path, file)        s = read_file(file_path)        words = tokenize(s)        length = 0        # We will store pairs of the word with the decimal logarithm of        # the word count here to use it later        words_and_logs = []        for word in words:            # Discard empty words            if word != '':                # Compute the decimal logarithm of the word count                log = math.log10(words.count(word))                # Add the square of the decimal logarithm to the length                length += log**2                # Store the word and decimal logarithm pair                words_and_logs.append((word, log))        # Compute the square root of the sum of the squares        # of the decimal logarithms of the words count        doc_lengths[i] = math.sqrt(length)        # Iterate over our stored pairs where we already have the        # decimal logarithms computed so we do not have to do it again        for word, log in words_and_logs:            # No need to discard empty words here as we discarded them before            # so words_and_logs will not have the empty word            term_in_document.setdefault(log / doc_lengths[i], set()).add(i)            postings[w] = term_in_document    return postingsdef query_rr(postings, qtext):    words = tokenize(qtext)    doc_scores = {}    for i in range(N):        score = 0        for w in words:            tf = words.count(w)            df = len(postings[w])            idf = math.log10(N / (df+1))            query_weights = tf * idf        for w in words:            if w in postings:                score = score + query_weights        doc_scores[i] = score    res = heapq.nlargest(10, doc_scores)    return respostings = index_text_files_rr('docs')print(query_rr(postings, 'hello'))
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python