用 Java 计算 PDF 文件中每个单词的出现次数

我正在使用 PDFBox 编写一个 Java 程序,它读取任意 PDF 文件并统计每个单词在文件中出现的次数。但由于某种原因,运行程序时没有任何输出。我希望它打印出每个单词,并在单词旁边显示其出现次数。提前致谢。这是我的代码:


package lab8;


import java.io.File;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.util.Map;

import java.util.TreeMap;

import java.util.Scanner;


import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.text.PDFTextStripper;


public class Extractor {



public static void main(String[] args) throws FileNotFoundException {

    Map<String, Integer> frequencies = new TreeMap<String, Integer>();

    PDDocument pd;

    File input = new File("C:\\Users\\Ammar\\Desktop\\Application.pdf"); 

    Scanner in = new Scanner(input);

    try {

        pd = PDDocument.load(input);

        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setEndPage(20);

        String text = stripper.getText(pd);


        while (in.hasNext()) {

            String word = clean(in.next());


            if (word != "") {

                Integer count = frequencies.get(word);




                if (count == null) {

                    count = 1;

                } else {

                    count = count + 1;

                }


                frequencies.put(word, count);

            }

        }


        for (String key : frequencies.keySet()) {

            System.out.println(key + ": " + frequencies.get(key));

        }


        if (pd != null) {

            pd.close();

        }

    } catch (IOException e) {

        e.printStackTrace();

    }

   }


    private static String clean(String s) {

    String r = "";

    for (int i = 0; i < s.length(); i++) {

        char c = s.charAt(i);

        if (Character.isLetter(c)) {

            r = r + c;

        }

    }

    return r.toLowerCase();

   }


  }


阿晨1998
浏览 375回答 2
2回答

子衿沉夜

我尝试修正了这段逻辑:

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class Extractor {
    public static void main(String[] args) throws FileNotFoundException {
        Map<String, Integer> wordFrequencies = new TreeMap<String, Integer>();
        Map<Character, Integer> charFrequencies = new TreeMap<Character, Integer>();
        PDDocument pd;
        File input = new File("C:\\Users\\Ammar\\Desktop\\Application.pdf");
        try {
            pd = PDDocument.load(input);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setEndPage(20);
            String text = stripper.getText(pd);
            for(int i=0; i<text.length(); i++)
            {
                char c = text.charAt(i);
                int count = charFrequencies.get(c) != null ? (charFrequencies.get(c)) + 1 : 1;
                charFrequencies.put(c, count);
            }
            String[] texts = text.split(" ");
            for (String txt : texts) {
                int count = wordFrequencies.get(txt) != null ? (wordFrequencies.get(txt)) + 1 : 1;
                wordFrequencies.put(txt, count);
            }
            System.out.println("Printing the number of words");
            for (String key : wordFrequencies.keySet()) {
                System.out.println(key + ": " + wordFrequencies.get(key));
            }
            System.out.println("Printing the number of characters");
            for (char charKey : charFrequencies.keySet()) {
                System.out.println(charKey + ": " + charFrequencies.get(charKey));
            }
            if (pd != null) {
                pd.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

试试这段代码。如果仍有问题而您无法解决,我可以再帮您看看。

慕标5832272

在您的代码中,您也可以改用 StringTokenizer:把提取出的文本传给它,即 StringTokenizer st = new StringTokenizer(stripper.getText(pd)); 然后在 while (st.hasMoreTokens()) 循环中逐个取出单词:String word = clean(st.nextToken()); 这样也能正常工作。
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Java