猿问

在文件夹目录中搜索位于 Word(.docx)文件表格(TABLES)中的文本/字符串

由于工作需要,我希望能够在大量 Word 文件中搜索特定文本。但是,这些文本位于 Word 文件(docx 和 doc)里数量不定的表格中。我正在尝试进行关键字搜索,只需打印出包含该文本的文件名。下面的代码可以运行,但没有找到任何结果。请帮忙。


import glob
import os
import re
import sys
from os import listdir

import numpy as np
import pandas as pd
import win32com.client as win32
from docx import Document


# Substring to look for inside table cells (the match is case-sensitive).
Keyword = 'the'

# Root folder that is walked recursively for Word documents.
documents = r'C:\Users\aac1928\Desktop\Test'


def tables_contain_keyword(doc, keyword):
    """Return True if any table cell of *doc* contains *keyword*.

    Args:
        doc: An object exposing ``tables``; each table exposes ``rows``,
            each row exposes ``cells`` and each cell exposes ``text``
            (the shape of a python-docx ``Document``).
        keyword: Substring to search for in each cell's text.

    Returns:
        True as soon as one cell contains the keyword, otherwise False.
        A document without tables yields False.
    """
    # any() short-circuits, so scanning stops at the first matching cell.
    return any(
        keyword in cell.text
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
    )


# Walk the folder tree and print the name of every Word file whose tables
# contain the keyword. Each matching file is printed exactly once.
for root, dirs, files in os.walk(documents, onerror=None):
    for filename in files:
        # Compare case-insensitively and include the dot so that only real
        # ".doc"/".docx" extensions are accepted.
        if not filename.lower().endswith((".doc", ".docx")):
            continue

        file_path = os.path.join(root, filename)
        try:
            with open(file_path, "rb") as f:
                found = tables_contain_keyword(Document(f), Keyword)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole scan,
            # but the failure is reported instead of being silently dropped.
            # NOTE(review): python-docx is documented for .docx packages;
            # legacy binary .doc files are expected to fail here and need to
            # be converted to .docx first -- confirm the desired handling.
            print("Skipped %s: %s" % (file_path, exc), file=sys.stderr)
            continue

        if found:
            print(filename)


白衣非少年
浏览 177回答 2
2回答

莫回无

在编写代码时,用 try/except 捕获并忽略所有异常并不是一个好主意(实际上任何时候都不是)。此外,您访问数据的方式似乎不正确,但下面的代码应该可以工作:

import numpy as np
import glob
import os
from os import listdir
from docx import Document
import re
import win32com.client as win32
import traceback

Keyword = 'the'
#pattern = re.compile(Keyword)
documents = r'C:\Users\aac1928\Desktop\Test'

def find_word():
    Keyword = 'the'
    #pattern = re.compile(Keyword)
    documents = r'/Users/marc/Documents'
    #Searches for Keywords in Converted Text Documents
    for root, dirs, files in os.walk(documents):
        for filename in files:
            print filename
            if filename.endswith(".doc") or filename.endswith("docx"):
                file_path = os.path.join(root, filename)
                with open(file_path, "rb") as f:
                    doc = Document(f)
                    if search_doc(doc, Keyword):
                        print file_path

def search_doc(doc, Keyword):
    for table in doc.tables:
        for j, column in enumerate(table.columns):
            for i, row in enumerate(table.rows):
                if Keyword in table.cell(j, i).text:
                    return True
    return False

这也确保您在找到关键字后停止在该文档中继续查找并转到下一个文档,而不是在文档的多个表格中都找到该关键字时多次打印文件名。

繁星淼淼

我是 Python 的新手,但是通过在您的代码中添加以下代码:

    except Exception:
        pass
        traceback.print_exc()

我可以看到 Cell 未定义。如果您将循环改为 while 循环,它就能工作,例如:

for table in doc.tables:
    for row in table.rows:
        i = 0
        while i < len(row.cells):
            if Keyword in row.cells[i].text:
                print(filename)
                continue

希望能帮助到你。

更新:

import numpy as np
import glob
import os
from os import listdir
from docx import Document
import re
import win32com.client as win32
import traceback

Keyword = 'the'
#pattern = re.compile(Keyword)
documents = r'C:\Users\aac1928\Desktop\Test'

#Searches for Keywords in Converted Text Documents
for root, dirs, files in os.walk(documents, onerror=None):
    print("Here 1")
    for filename in files:
        print(filename)
        if filename.endswith(".doc") or filename.endswith("docx"):
            file_path = os.path.join(root, filename)
            print(file_path)
            try:
                    with open(file_path, "rb") as f:
                        doc = Document(f)
                        for table in doc.tables:
                            for row in table.rows:
                                i = 0
                                while i < len(row.cells):
                                    if Keyword in row.cells[i].text:
                                        print(filename)
                                        continue
            except Exception:
                pass
                traceback.print_exc()

更新 2:

import numpy as np
import glob
import os
from os import listdir
from docx import Document
import re
import win32com.client as win32
import traceback

Keyword = 'the'
#pattern = re.compile(Keyword)
documents = r'C:\Users\aac1928\Desktop\Test'
documentsWithKeyword = []

#Searches for Keywords in Converted Text Documents
for root, dirs, files in os.walk(documents, onerror=None):
    print("Here 1")
    for filename in files:
        print(filename)
        if filename.endswith(".doc") or filename.endswith("docx"):
            file_path = os.path.join(root, filename)
            print(file_path)
            try:
                    with open(file_path, "rb") as f:
                        doc = Document(f)
                        for table in doc.tables:
                            for row in table.rows:
                                i = 0
                                while i < len(row.cells):
                                    if Keyword in row.cells[i].text:
                                        documentsWithKeyword.append(filename)
                                        continue
            except Exception:
                pass
                traceback.print_exc()

# remove duplicates
documentsWithKeyword = list(set(documentsWithKeyword))
documentsWithKeyword.sort()

#print documents that have the word
for docwithKeyword in documentsWithKeyword
    print(docwithKeyword)

分享
随时随地看视频慕课网APP

相关分类

Python
我要回答