检查数组的某个元素是否与以下相同

我正在创建一个将 pdf 解析为文本的服务。拿到文本后,我需要匹配一组单词,每匹配到一次就让计数器加一。到目前为止,一切都很好。困难在于,解析为文本之后,我无法判断当前处于 pdf 的哪一页。我发现,在解析出的文本中,每出现两个连续的换行符(\n\n)就意味着发生了一次换页。


我想做的是检查页面是否已更改,并且除了计算一个单词总共被找到的次数外,还要说出它在哪些页面上。


例子


// Sample text extracted from a PDF. Two consecutive newlines ("\n\n") in the
// extracted text are treated as a page break.
const data = `resignations / resignations. adm. mancom .: berenguer llinares

appointments. adm. unique: calvo valenzuela. other concepts: change of the administrative body:

joint administrators to sole administrator. change of registered office. ptda colomer, 6


Official Gazette of the Commercial Registry

no. 182 Friday, September 18, 2020 p. 33755

cve: borme-a-2020-182-03 verifiable in

sarria). registry data. t 2257, f 100, s 8, h a 54815, i / a 4 (10.09.20) .`;

const wordsToSearch = ['resignations', 'administrators'];

/**
 * Counts the non-overlapping, literal occurrences of `needle` in `haystack`.
 *
 * The search is a plain substring search, so characters such as `.` or `+`
 * in `needle` carry no special meaning.
 *
 * @param {string} haystack - Text to search in.
 * @param {string} needle - Text to look for.
 * @returns {number} Number of occurrences; 0 when `needle` is empty.
 */
function countOccurrences(haystack, needle) {
  // An empty needle "matches" at every position; treat it as not found.
  if (needle === '') {
    return 0;
  }
  let count = 0;
  let from = haystack.indexOf(needle);
  while (from !== -1) {
    count += 1;
    from = haystack.indexOf(needle, from + needle.length);
  }
  return count;
}

/**
 * For every search word, reports how often it occurs in `text` and on which
 * pages it was found.
 *
 * @param {string} text - Full extracted text.
 * @param {string[]} words - Words to look for (matched literally).
 * @param {string} [pageSeparator='\n\n'] - Marker that separates two pages.
 * @returns {{word: string, count: number, pages: number[]}[]} One entry per
 *   search word, in input order. `pages` holds 1-based page numbers in
 *   ascending order and is empty when the word does not occur.
 */
function findWordsByPage(text, words, pageSeparator = '\n\n') {
  // Split once up front; the page list is the same for every word.
  const pages = text.split(pageSeparator);

  return words.map((word) => {
    const pageNumbers = [];
    let count = 0;
    pages.forEach((pageText, pageIndex) => {
      const hits = countOccurrences(pageText, word);
      if (hits > 0) {
        count += hits;
        pageNumbers.push(pageIndex + 1);
      }
    });
    return { word, count, pages: pageNumbers };
  });
}

findWordsByPage(data, wordsToSearch).forEach(({ word, count, pages }) => {
  const where = pages.length > 0 ? ` on page(s) ${pages.join(', ')}` : '';
  console.log(`${word} has appeared ${count} times${where}\n`);
});

/*
Output:

resignations has appeared 2 times on page(s) 1

administrators has appeared 1 times on page(s) 3
*/


如果有人也想出另一种方法,那就太好了


海绵宝宝撒
浏览 40回答 1
1回答

莫回无

您可以在那些双换行符处拆分文本,然后单独分析每个页面。我会这样做:let data = `resignations / Friday resignations. adm. mancom .: berenguer llinares&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; appointments. adm. unique: calvo Friday valenzuela. other concepts: change of the administrative body:&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; joint administrators to sole administrator. change of registered office. ptda colomer, 6, Friday&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Official Gazette of the Commercial Registry&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; no. 182 Friday, September 18, 2020 p. 33755&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; cve: borme-a-2020-182-03 verifiable in&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; sarria). registry data. t 2257, f 100, s 8, h a 54815, i / a 4 (10.09.20) .`function analyseText(text, wordsToFind) {&nbsp; &nbsp; const pages = data.split("\n\n");&nbsp; &nbsp; const result = {};&nbsp; &nbsp; for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {&nbsp; &nbsp; &nbsp; &nbsp; analysePage({&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pageIndex,&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pageText: pages[pageIndex]&nbsp; &nbsp; &nbsp; &nbsp; }, wordsToFind, result);&nbsp; &nbsp; }&nbsp; &nbsp; return Object.keys(result).map(k => result[k]);}function analysePage(page, wordsToFind, result) {&nbsp; &nbsp; const {&nbsp; &nbsp; &nbsp; &nbsp; pageText,&nbsp; &nbsp; &nbsp; &nbsp; pageIndex&nbsp; &nbsp; } = page;&nbsp; &nbsp; wordsToFind.forEach(word => {&nbsp; &nbsp; &nbsp; &nbsp; const count = (pageText.match(new RegExp(word, 'g')) || []).length;&nbsp; &nbsp; &nbsp; &nbsp; if (count > 0) {&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; if (!result[word]) {&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; result[word] = {&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; name: word,&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pageIndices: [],&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; count: 0&nbsp; &nbsp; 
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; };&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; result[word].pageIndices.push(pageIndex);&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; result[word].count += count;&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp; &nbsp; });}const result = analyseText(data, ['resignations', "administrators", "Friday"]);console.log(result);在此示例中,我只是打印每一页的结果,但您当然可以构建一些结果对象,在其中保存每一页的结果。
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

JavaScript