使用 iText 替换 PDF 文件中的文本

正如评论和答案中已经提到的，PDF 不是一种用于文本编辑的格式。它是最终格式，有关文本流、布局甚至到 Unicode 的映射的信息都是可选的。因此，即使假设存在关于将字形映射到 Unicode 的可选信息，使用 iText 完成此任务的方法可能看起来有点不令人满意：首先使用自定义文本提取策略确定相关文本的位置，然后继续删除该位置所有内容的当前内容PdfCleanUpProcessor，最后将替换文本绘制到间隙中。在这个答案中，我将提供一个帮助程序类，允许结合前两个步骤，查找和删除现有文本，其优点是实际上只删除文本，而不是任何背景图形等，就像PdfCleanUpProcessor编辑的情况一样。助手还返回被移除文本的位置，允许在其上标记替换。helper 类基于此较早答案PdfContentStreamEditor中提供的内容。不过，请使用github 上此类的版本，因为原始类自构想以来已得到一些增强。helperSimpleTextRemover类说明了从 PDF 中正确删除文本所必需的内容。其实限制在几个方面：它只替换实际页面内容流中的文本。要同时替换嵌入式 XObject 中的文本，必须递归地遍历相关页面的 XObject 资源，并将编辑器应用于它们。它的“简单”方式与以下方式相同SimpleTextExtractionStrategy：它假定显示说明的文本按阅读顺序出现在内容中。还要处理顺序不同且指令必须排序的内容流，这意味着所有传入指令和相关呈现信息必须缓存到页面末尾，而不仅仅是一次几个指令。然后可以对渲染信息进行排序，可以在排序后的渲染信息中标识要移除的部分，可以操纵相关联的指令，并且最终可以存储指令。它不会尝试识别在视觉上代表空白的字形之间的间隙，而实际上根本没有字形。要识别间隙，必须扩展代码以检查两个连续的字形是否完全相继，或者是否存在间隙或跳行。在计算删除字形的间隙时，它还没有考虑字符和单词的间距。要改进这一点，必须改进字形宽度计算。但是，考虑到您的内容流中的示例摘录，这些限制可能不会妨碍您。public class SimpleTextRemover extends PdfContentStreamEditor {    public SimpleTextRemover() {        super (new SimpleTextRemoverListener());        ((SimpleTextRemoverListener)getRenderListener()).simpleTextRemover = this;    }    /**     * <p>Removes the string to remove from the given page of the     * document in the PDF reader the given PDF stamper works on.</p>     * <p>The result is a list of glyph lists each of which represents     * a match can can be queried for position information.</p>     */    public List<List<Glyph>> remove(PdfStamper pdfStamper, int pageNum, String toRemove) throws IOException {        if (toRemove.length()  == 0)            return Collections.emptyList();        this.toRemove = toRemove;        cachedOperations.clear();        elementNumber = -1;        pendingMatch.clear();        matches.clear();        allMatches.clear();        editPage(pdfStamper, pageNum);        return allMatches;    }    /**     * Adds the given operation to the cached operations and checks     * whether some cached operations can meanwhile be processed and     * written to the result content stream.     */    @Override    protected void write(PdfContentStreamProcessor processor, PdfLiteral operator, List<PdfObject> operands) throws IOException {        cachedOperations.add(new ArrayList<>(operands));        while (process(processor)) {            cachedOperations.remove(0);        }    }    /**     * Removes any started match and sends all remaining cached     * operations for processing.     */    @Override    public void finalizeContent() {        pendingMatch.clear();        try {            while (!cachedOperations.isEmpty()) {                if (!process(this)) {                    // TODO: Should not happen, so warn                    System.err.printf("Failure flushing operation %s; dropping.\n", cachedOperations.get(0));                }                cachedOperations.remove(0);            }        } catch (IOException e) {            throw new ExceptionConverter(e);        }    }    /**     * Tries to process the first cached operation. Returns whether     * it could be processed.     */    boolean process(PdfContentStreamProcessor processor) throws IOException {        if (cachedOperations.isEmpty())            return false;        List<PdfObject> operands = cachedOperations.get(0);        PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);        String operatorString = operator.toString();        if (TEXT_SHOWING_OPERATORS.contains(operatorString))            return processTextShowingOp(processor, operator, operands);        super.write(processor, operator, operands);        return true;    }    /**     * Tries to processes a text showing operation. Unless a match     * is pending and starts before the end of the argument of this     * instruction, it can be processed. If the instructions contains     * a part of a match, it is transformed to a TJ operation and     * the glyphs in question are replaced by text position adjustments.     * If the original operation had a side effect (jump to next line     * or spacing adjustment), this side effect is explicitly added.     */    boolean processTextShowingOp(PdfContentStreamProcessor processor, PdfLiteral operator, List<PdfObject> operands) throws IOException {        PdfObject object = operands.get(operands.size() - 2);        boolean isArray = object instanceof PdfArray;        PdfArray array = isArray ? (PdfArray) object : new PdfArray(object);        int elementCount = countStrings(object);        // Currently pending glyph intersects parameter of this operation -> cannot yet process        if (!pendingMatch.isEmpty() && pendingMatch.get(0).elementNumber < processedElements + elementCount)            return false;        // The parameter of this operation is subject to a match -> copy as is        if (matches.size() == 0 || processedElements + elementCount <= matches.get(0).get(0).elementNumber || elementCount == 0) {            super.write(processor, operator, operands);            processedElements += elementCount;            return true;        }        // The parameter of this operation contains glyphs of a match -> manipulate         PdfArray newArray = new PdfArray();        for (int arrayIndex = 0; arrayIndex < array.size(); arrayIndex++) {            PdfObject entry = array.getPdfObject(arrayIndex);            if (!(entry instanceof PdfString)) {                newArray.add(entry);            } else {                PdfString entryString = (PdfString) entry;                byte[] entryBytes = entryString.getBytes();                for (int index = 0; index < entryBytes.length; ) {                    List<Glyph> match = matches.size() == 0 ? null : matches.get(0);                    Glyph glyph = match == null ? null : match.get(0);                    if (glyph == null || processedElements < glyph.elementNumber) {                        newArray.add(new PdfString(Arrays.copyOfRange(entryBytes, index, entryBytes.length)));                        break;                    }                    if (index < glyph.index) {                        newArray.add(new PdfString(Arrays.copyOfRange(entryBytes, index, glyph.index)));                        index = glyph.index;                        continue;                    }                    newArray.add(new PdfNumber(-glyph.width));                    index++;                    match.remove(0);                    if (match.isEmpty())                        matches.remove(0);                }                processedElements++;            }        }        writeSideEffect(processor, operator, operands);        writeTJ(processor, newArray);        return true;    }    /**     * Counts the strings in the given argument, itself a string or     * an array containing strings and non-strings.     */    int countStrings(PdfObject textArgument) {        if (textArgument instanceof PdfArray) {            int result = 0;            for (PdfObject object : (PdfArray)textArgument) {                if (object instanceof PdfString)                    result++;            }            return result;        } else             return textArgument instanceof PdfString ? 1 : 0;    }    /**     * Writes side effects of a text showing operation which is going to be     * replaced by a TJ operation. Side effects are line jumps and changes     * of character or word spacing.     */    void writeSideEffect(PdfContentStreamProcessor processor, PdfLiteral operator, List<PdfObject> operands) throws IOException {        switch (operator.toString()) {        case "\"":            super.write(processor, OPERATOR_Tw, Arrays.asList(operands.get(0), OPERATOR_Tw));            super.write(processor, OPERATOR_Tc, Arrays.asList(operands.get(1), OPERATOR_Tc));        case "'":            super.write(processor, OPERATOR_Tasterisk, Collections.singletonList(OPERATOR_Tasterisk));        }    }    /**     * Writes a TJ operation with the given array unless array is empty.     */    void writeTJ(PdfContentStreamProcessor processor, PdfArray array) throws IOException {        if (!array.isEmpty()) {            List<PdfObject> operands = Arrays.asList(array, OPERATOR_TJ);            super.write(processor, OPERATOR_TJ, operands);        }    }    /**     * Analyzes the given text render info whether it starts a new match or     * finishes / continues / breaks a pending match. This method is called     * by the {@link SimpleTextRemoverListener} registered as render listener     * of the underlying content stream processor.     */    void renderText(TextRenderInfo renderInfo) {        elementNumber++;        int index = 0;        for (TextRenderInfo info : renderInfo.getCharacterRenderInfos()) {            int matchPosition = pendingMatch.size();            pendingMatch.add(new Glyph(info, elementNumber, index));            if (!toRemove.substring(matchPosition, matchPosition + info.getText().length()).equals(info.getText())) {                reduceToPartialMatch();            }            if (pendingMatch.size() == toRemove.length()) {                matches.add(new ArrayList<>(pendingMatch));                allMatches.add(new ArrayList<>(pendingMatch));                pendingMatch.clear();            }            index++;        }    }    /**     * Reduces the current pending match to an actual (partial) match     * after the addition of the next glyph has invalidated it as a     * whole match.     */    void reduceToPartialMatch() {        outer:        while (!pendingMatch.isEmpty()) {            pendingMatch.remove(0);            int index = 0;            for (Glyph glyph : pendingMatch) {                if (!toRemove.substring(index, index + glyph.text.length()).equals(glyph.text)) {                    continue outer;                }                index++;            }            break;        }    }    String toRemove = null;    final List<List<PdfObject>> cachedOperations = new LinkedList<>();    int elementNumber = -1;    int processedElements = 0;    final List<Glyph> pendingMatch = new ArrayList<>();    final List<List<Glyph>> matches = new ArrayList<>();    final List<List<Glyph>> allMatches = new ArrayList<>();    /**     * Render listener class used by {@link SimpleTextRemover} as listener     * of its content stream processor ancestor. Essentially it forwards     * {@link TextRenderInfo} events and ignores all else.     */    static class SimpleTextRemoverListener implements RenderListener {        @Override        public void beginTextBlock() { }        @Override        public void renderText(TextRenderInfo renderInfo) {            simpleTextRemover.renderText(renderInfo);        }        @Override        public void endTextBlock() { }        @Override        public void renderImage(ImageRenderInfo renderInfo) { }        SimpleTextRemover simpleTextRemover = null;    }    /**     * Value class representing a glyph with information on     * the displayed text and its position, the overall number     * of the string argument of a text showing instruction     * it is in and the index at which it can be found therein,     * and the width to use as text position adjustment when     * replacing it. Beware, the width does not yet consider     * character and word spacing!     */    public static class Glyph {        public Glyph(TextRenderInfo info, int elementNumber, int index) {            text = info.getText();            ascent = info.getAscentLine();            base = info.getBaseline();            descent = info.getDescentLine();            this.elementNumber = elementNumber;            this.index = index;            this.width = info.getFont().getWidth(text);        }        public final String text;        public final LineSegment ascent;        public final LineSegment base;        public final LineSegment descent;        final int elementNumber;        final int index;        final float width;    }    final PdfLiteral OPERATOR_Tasterisk = new PdfLiteral("T*");    final PdfLiteral OPERATOR_Tc = new PdfLiteral("Tc");    final PdfLiteral OPERATOR_Tw = new PdfLiteral("Tw");    final PdfLiteral OPERATOR_Tj = new PdfLiteral("Tj");    final PdfLiteral OPERATOR_TJ = new PdfLiteral("TJ");    final static List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");    final static Glyph[] EMPTY_GLYPH_ARRAY = new Glyph[0];}( SimpleTextRemover辅助类)你可以像这样使用它：PdfReader pdfReader = new PdfReader(SOURCE);PdfStamper pdfStamper = new PdfStamper(pdfReader, RESULT_STREAM);SimpleTextRemover remover = new SimpleTextRemover();System.out.printf("\ntest.pdf - Test\n");for (int i = 1; i <= pdfReader.getNumberOfPages(); i++){    System.out.printf("Page %d:\n", i);    List<List<Glyph>> matches = remover.remove(pdfStamper, i, "Test");    for (List<Glyph> match : matches) {        Glyph first = match.get(0);        Vector baseStart = first.base.getStartPoint();        Glyph last = match.get(match.size()-1);        Vector baseEnd = last.base.getEndPoint();        System.out.printf("  Match from (%3.1f %3.1f) to (%3.1f %3.1f)\n", baseStart.get(I1), baseStart.get(I2), baseEnd.get(I1), baseEnd.get(I2));    }}pdfStamper.close();（移除页面文本内容测试testRemoveTestFromTest）我的测试文件有以下控制台输出：test.pdf - TestPage 1:  Match from (134,8 666,9) to (177,8 666,9)  Match from (134,8 642,0) to (153,4 642,0)  Match from (172,8 642,0) to (191,4 642,0)以及输出 PDF 中那些位置缺少“测试”的情况。您可以使用它们在相关位置绘制替换文本，而不是输出匹配坐标。

使用 iText 替换 PDF 文件中的文本

1回答