diff --git a/go.mod b/go.mod deleted file mode 100644 index ce6a6a2..0000000 --- a/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module git.nkpl.cc/twocookedfaggots/scraper - -go 1.25.5 - -require golang.org/x/net v0.50.0 diff --git a/go.sum b/go.sum deleted file mode 100644 index 13ce0c7..0000000 --- a/go.sum +++ /dev/null @@ -1,2 +0,0 @@ -golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= -golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= diff --git a/match.go b/match.go index 529853d..2ce6f36 100644 --- a/match.go +++ b/match.go @@ -1,3 +1,4 @@ +//go:build ignore package scraper import ( @@ -17,7 +18,7 @@ var body = &html.Node{ var ErrNotAnElementNode = errors.New("not a html.ElementNode") -func parseFragment(s string) (*html.Node, error) { +func ParseFragment(s string) (*html.Node, error) { n, err := html.ParseFragment(strings.NewReader(s), body) if err != nil { return nil, err @@ -33,13 +34,13 @@ func MatchElemAttr(s string, n *html.Node) (bool, error) { if n1.Type != html.ElementNode { return false, errors.Join( - ErrNotAnElementNode, + ErrNotAnElementNode, errors.New("s isn's a html.ElementNode"), - ) + ) } if n.Type != html.ElementNode { return false, errors.Join( - ErrNotAnElementNode, + ErrNotAnElementNode, errors.New("n isn's a html.ElementNode"), ) } @@ -51,7 +52,6 @@ func MatchElemAttr(s string, n *html.Node) (bool, error) { return false, nil } - func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) { ch := make(chan *html.Node) var crawl func(*html.Node) diff --git a/match_test.go b/match_test.go index 1e1111b..fb46fd9 100644 --- a/match_test.go +++ b/match_test.go @@ -1,3 +1,4 @@ +//go:build ignore package scraper import ( @@ -24,3 +25,5 @@ func TestMatchElemAttr(t *testing.T) { t.Fail() } } + + diff --git a/scraper.go b/search.go similarity index 82% rename from scraper.go rename to search.go index dfdbbb2..62cd47b 100644 --- a/scraper.go +++ b/search.go @@ -42,7 +42,12 @@ func SearchAttr(n *html.Node, key, contains string) chan *html.Node { crawl = func(n *html.Node) { for _, a := range n.Attr { if a.Key == key && - containsWord(a.Val, contains) { + func() bool { + if contains == "" { + return true + } + return containsWord(a.Val, contains) + }() { ch <- n } } @@ -81,11 +86,11 @@ func CrawlText(n *html.Node) string { if n.Type == html.TextNode { Must( s.WriteString(func() string { - trimmed := strings.TrimSpace(n.Data) + trimmed := strings.TrimSpace(n.Data) if n.Data == "" { return trimmed - } - return trimmed+"\n" + } + return trimmed + "\n" }()), ) } @@ -98,3 +103,12 @@ func CrawlText(n *html.Node) string { crawl(n) return s.String() } + +func ExtractAttrValue(n *html.Node, key string) string { + for _, a := range n.Attr { + if a.Key == key { + return a.Val + } + } + return "" +} diff --git a/search_node.go b/search_node.go new file mode 100644 index 0000000..c2abb30 --- /dev/null +++ b/search_node.go @@ -0,0 +1,40 @@ +package scraper + +import "golang.org/x/net/html" + +func MatchNode(n1 *html.Node, n2 *html.Node) bool { + if n1.Data == n2.Data { + if len(n1.Attr) == 0 || + len(n2.Attr) == 0 { + return false + } + for i, a1 := range n1.Attr { + a2 := n2.Attr[i] + return a1.Key == a2.Key + } + } + return false +} + +func SearchNode(target *html.Node, n *html.Node) chan *html.Node { + ch := make(chan *html.Node) + var crawl crawlFunc + crawl = func(n *html.Node) { + if n.Type == html.ElementNode { + if MatchNode(target, n) { + ch <- n + } + } + c := n.FirstChild + for c != nil { + crawl(c) + c = c.NextSibling + } + } + go func() { + defer close(ch) + crawl(n) + }() + return ch +} + diff --git a/search_node_test.go b/search_node_test.go new file mode 100644 index 0000000..c74fbd8 --- /dev/null +++ b/search_node_test.go @@ -0,0 +1,48 @@ +package scraper + +import ( + "strings" + "testing" + + "golang.org/x/net/html" +) + +/* +var body = &html.Node{ + Type: html.ElementNode, + Data: "body", + DataAtom: atom.Body, +} +*/ + +const fragment = `
something
` + +func TestSearchNode(t *testing.T) { + doc1, err := html.ParseFragment(strings.NewReader(fragment), body) + if err != nil { + t.Error(err) + } + n1 := doc1[0] + doc2, err := html.ParseFragment(strings.NewReader(fragment), body) + if err != nil { + t.Error(err) + } + n2 := doc2[0] + ch := SearchNode(n1, n2) + t.Log(CrawlText(<-ch)) +} + + +func TestMatchNode(t *testing.T) { + doc1, err := html.ParseFragment(strings.NewReader(fragment), body) + if err != nil { + t.Error(err) + } + n1 := doc1[0] + doc2, err := html.ParseFragment(strings.NewReader(fragment), body) + if err != nil { + t.Error(err) + } + n2 := doc2[0] + t.Log(MatchNode(n1, n2)) +} diff --git a/scraper_test.go b/search_test.go similarity index 96% rename from scraper_test.go rename to search_test.go index f1903ad..60d4598 100644 --- a/scraper_test.go +++ b/search_test.go @@ -10,12 +10,14 @@ import ( const htmlStr = `
HelloWorld
` +/* func Must[T any](v T, err error) T { if err != nil { panic(err) } return v } +*/ func TestSearchElem(t *testing.T) { doc := Must( @@ -65,7 +67,7 @@ func TestSearchElemAttr(t *testing.T) { } } -func TestCrawlTest(t *testing.T) { +func TestCrawlText(t *testing.T) { fmt.Println( CrawlText(Must( html.Parse(strings.NewReader(htmlStr)), diff --git a/util.go b/util.go index 761b058..c589b02 100644 --- a/util.go +++ b/util.go @@ -4,6 +4,9 @@ import ( "fmt" "regexp" "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) func containsWord(s, substr string) (match bool) { @@ -22,3 +25,17 @@ func containsWord(s, substr string) (match bool) { } return } + +var body = &html.Node{ + Type: html.ElementNode, + Data: "body", + DataAtom: atom.Body, +} + +func ParseFragment(s string) (*html.Node, error) { + n, err := html.ParseFragment(strings.NewReader(s), body) + if err != nil { + return nil, err + } + return n[0], nil +}