在Golang中提取*html.Node的位置偏移

如何为已解析的 HTML 文档中的特定节点提取位置偏移量?例如,对于文档 <div>Hello, <b>World!</b></div>,我希望能够知道 World! 的偏移量是 15:21。文档在解析过程中可能会被修改。


我有一个用特殊标记渲染整个文档的解决方案,但这对性能来说真的很糟糕。有任何想法吗?


package main


import (

    "bytes"

    "golang.org/x/net/html"

    "golang.org/x/net/html/atom"

    "log"

    "strings"

)


func nodeIndexOffset(context *html.Node, node *html.Node) (int, int) {

    if node.Type != html.TextNode {

        node = node.FirstChild

    }

    originalData := node.Data


    var buf bytes.Buffer

    node.Data = "|start|" + originalData

    _ = html.Render(&buf, context.FirstChild)

    start := strings.Index(buf.String(), "|start|")


    buf = bytes.Buffer{}

    node.Data = originalData + "|end|"

    _ = html.Render(&buf, context.FirstChild)

    end := strings.Index(buf.String(), "|end|")


    node.Data = originalData

    return start, end

}


func main() {

    s := "<div>Hello, <b>World!</b></div>"

    var context html.Node

    context = html.Node{

        Type:     html.ElementNode,

        Data:     "body",

        DataAtom: atom.Body,

    }

    nodes, err := html.ParseFragment(strings.NewReader(s), &context)

    if err != nil {

        log.Fatal(err)

    }

    for _, node := range nodes {

        context.AppendChild(node)

    }

    world := nodes[0].FirstChild.NextSibling.FirstChild

    log.Println("target", world)

    log.Println(nodeIndexOffset(&context, world))

}


富国沪深
浏览 320回答 2
2回答

波斯汪

这不算是答案,但作为评论又太长了。以下方法可能在一定程度上可行:使用 Tokenizer 逐步遍历每个元素。将您的输入包装到一个自定义 Reader 中,该 Reader 在 Tokenizer 读取时记录行和列偏移量。在调用 Next() 之前和之后查询自定义 Reader 的位置,以记录您需要的大致位置信息。这有点麻烦,而且不太准确,但可能是您能做到的最好办法。

智慧大石

我想出了一个解决方案:我们扩展原始的 html 包(如果有其他方法,请纠正我),添加一个带有新导出函数的附加文件 custom.go。该函数能够访问 Tokenizer 未导出的 data 属性,该属性准确保存了当前 Node 的开始和结束位置。每次读取缓冲区之后,我们都必须调整位置,参见 globalBufDif。我真的不喜欢仅仅为了访问几个属性就必须 fork 整个包,但这似乎就是 Go 的方式。

func parseWithIndexes(p *parser) (map[*Node][2]int, error) {
    // Iterate until EOF. Any other error will cause an early return.
    var err error
    var globalBufDif int
    var prevEndBuf int
    var tokenIndex [2]int
    tokenMap := make(map[*Node][2]int)
    for err != io.EOF {
        // CDATA sections are allowed only in foreign content.
        n := p.oe.top()
        p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
        t := p.top().FirstChild
        for {
            if t != nil && t.NextSibling != nil {
                t = t.NextSibling
            } else {
                break
            }
        }
        tokenMap[t] = tokenIndex
        if prevEndBuf > p.tokenizer.data.end {
            globalBufDif += prevEndBuf
        }
        prevEndBuf = p.tokenizer.data.end
        // Read and parse the next token.
        p.tokenizer.Next()
        tokenIndex = [2]int{p.tokenizer.data.start + globalBufDif, p.tokenizer.data.end + globalBufDif}
        p.tok = p.tokenizer.Token()
        if p.tok.Type == ErrorToken {
            err = p.tokenizer.Err()
            if err != nil && err != io.EOF {
                return tokenMap, err
            }
        }
        p.parseCurrentToken()
    }
    return tokenMap, nil
}

// ParseFragmentWithIndexes parses a fragment of HTML and returns the nodes
// that were found. If the fragment is the InnerHTML for an existing element,
// pass that element in context.
func ParseFragmentWithIndexes(r io.Reader, context *Node) ([]*Node, map[*Node][2]int, error) {
    contextTag := ""
    if context != nil {
        if context.Type != ElementNode {
            return nil, nil, errors.New("html: ParseFragment of non-element Node")
        }
        // The next check isn't just context.DataAtom.String() == context.Data because
        // it is valid to pass an element whose tag isn't a known atom. For example,
        // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
        if context.DataAtom != a.Lookup([]byte(context.Data)) {
            return nil, nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
        }
        contextTag = context.DataAtom.String()
    }
    p := &parser{
        tokenizer: NewTokenizerFragment(r, contextTag),
        doc: &Node{
            Type: DocumentNode,
        },
        scripting: true,
        fragment:  true,
        context:   context,
    }
    root := &Node{
        Type:     ElementNode,
        DataAtom: a.Html,
        Data:     a.Html.String(),
    }
    p.doc.AppendChild(root)
    p.oe = nodeStack{root}
    p.resetInsertionMode()
    for n := context; n != nil; n = n.Parent {
        if n.Type == ElementNode && n.DataAtom == a.Form {
            p.form = n
            break
        }
    }
    tokenMap, err := parseWithIndexes(p)
    if err != nil {
        return nil, nil, err
    }
    parent := p.doc
    if context != nil {
        parent = root
    }
    var result []*Node
    for c := parent.FirstChild; c != nil; {
        next := c.NextSibling
        parent.RemoveChild(c)
        result = append(result, c)
        c = next
    }
    return result, tokenMap, nil
}
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Go