mirror of https://github.com/fmartingr/dharma.git
wip
This commit is contained in:
parent
9197dd2371
commit
d2a74a7a4e
|
@ -0,0 +1,7 @@
|
|||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
// main prints a fixed greeting to standard output, followed by a newline.
func main() {
	const greeting = "Hello there!"
	fmt.Println(greeting)
}
|
|
@ -0,0 +1,23 @@
|
|||
module github.com/fmartingr/dharma
|
||||
|
||||
go 1.19
|
||||
|
||||
require github.com/gocolly/colly v1.2.0
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.8.0 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.1 // indirect
|
||||
github.com/antchfx/htmlquery v1.2.5 // indirect
|
||||
github.com/antchfx/xmlquery v1.3.12 // indirect
|
||||
github.com/antchfx/xpath v1.2.1 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
||||
github.com/temoto/robotstxt v1.1.2 // indirect
|
||||
golang.org/x/net v0.0.0-20220809184613-07c6da5e1ced // indirect
|
||||
golang.org/x/text v0.3.7 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
google.golang.org/protobuf v1.28.1 // indirect
|
||||
)
|
|
@ -0,0 +1,65 @@
|
|||
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
|
||||
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
|
||||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
||||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
||||
github.com/antchfx/htmlquery v1.2.5 h1:1lXnx46/1wtv1E/kzmH8vrfMuUKYgkdDBA9pIdMJnk4=
|
||||
github.com/antchfx/htmlquery v1.2.5/go.mod h1:2MCVBzYVafPBmKbrmwB9F5xdd+IEgRY61ci2oOsOQVw=
|
||||
github.com/antchfx/xmlquery v1.3.12 h1:6TMGpdjpO/P8VhjnaYPXuqT3qyJ/VsqoyNTmJzNBTQ4=
|
||||
github.com/antchfx/xmlquery v1.3.12/go.mod h1:3w2RvQvTz+DaT5fSgsELkSJcdNgkmg6vuXDEuhdwsPQ=
|
||||
github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8=
|
||||
github.com/antchfx/xpath v1.2.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
||||
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
|
||||
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
|
||||
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
|
||||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
|
||||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
|
||||
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw=
|
||||
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
||||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
|
||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
|
||||
golang.org/x/net v0.0.0-20220809184613-07c6da5e1ced h1:3dYNDff0VT5xj+mbj2XucFst9WKk6PdGOrb9n+SbIvw=
|
||||
golang.org/x/net v0.0.0-20220809184613-07c6da5e1ced/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
|
||||
google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
||||
google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w=
|
||||
google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
|
@ -0,0 +1,172 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// Origin: https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77

// RemoveAsciiTabAndNewlines returns s with every ASCII tab ('\t'), line feed
// ('\n') and carriage return ('\r') deleted; all other characters are kept.
//
// This mirrors step 3 of the WHATWG basic URL parser
// (https://url.spec.whatwg.org/#concept-basic-url-parser). Step 2 of that
// algorithm flags these characters as a "validation error", but that is not a
// hard failure: browsers silently strip them, and so does this function.
func RemoveAsciiTabAndNewlines(s string) string {
	// A negative return value tells strings.Map to drop the rune.
	dropWhitespace := func(r rune) rune {
		if r == '\t' || r == '\n' || r == '\r' {
			return -1
		}
		return r
	}
	return strings.Map(dropWhitespace, s)
}
|
||||
|
||||
func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) {
|
||||
href := RemoveAsciiTabAndNewlines(e.Attr(attr))
|
||||
if !strings.HasPrefix(href, "http") {
|
||||
href = e.Request.AbsoluteURL(href)
|
||||
}
|
||||
u, err := url.Parse(href)
|
||||
if err != nil {
|
||||
log.Printf("in: %s", e.Request.URL.String())
|
||||
log.Printf("error parsing %s: %s", attr, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return u, nil
|
||||
}
|
||||
|
||||
// referenceError is the error type returned by References methods for
// invalid input.
type referenceError string

// Error implements the error interface.
func (e referenceError) Error() string { return string(e) }

// errNilURL is returned by Register when either of its arguments is nil.
const errNilURL = referenceError("references: nil url")

// References records, for every link target, the set of pages on which that
// target was found.
//
// The zero value is ready to use. A References must not be copied after first
// use because it contains a mutex. All methods are guarded by that mutex.
type References struct {
	// urls maps the string form of a link target to the pages referencing
	// it. The inner map is keyed by the page's string form, so registering
	// the same (page, target) pair more than once keeps a single entry.
	urls map[string]map[string]*url.URL
	// urlMu guards urls.
	urlMu sync.RWMutex
}

// From returns the pages registered as referencing refLink, where refLink is
// the string form of the link target as passed to Register.
//
// The result is nil when nothing is registered for refLink. The order of the
// returned pages is unspecified because they are collected from a map.
func (r *References) From(refLink string) (result []*url.URL) {
	r.urlMu.RLock()
	defer r.urlMu.RUnlock()

	// Indexing a nil map or a missing key yields a nil inner map, whose
	// length is zero, so both cases are covered by this single check.
	pages := r.urls[refLink]
	if len(pages) == 0 {
		return nil
	}

	result = make([]*url.URL, 0, len(pages))
	for _, page := range pages {
		result = append(result, page)
	}
	return result
}

// Register records that the page base contains a link to refLink.
//
// Registering the same pair again is a no-op apart from replacing the stored
// *url.URL for base. It returns errNilURL, without modifying the receiver,
// when base or refLink is nil; otherwise it returns nil.
func (r *References) Register(base *url.URL, refLink *url.URL) error {
	if base == nil || refLink == nil {
		return errNilURL
	}

	// Compute the keys before taking the lock to keep the critical section
	// short.
	target, page := refLink.String(), base.String()

	r.urlMu.Lock()
	defer r.urlMu.Unlock()

	// Allocate lazily so a zero-value References does not panic on its
	// first write.
	if r.urls == nil {
		r.urls = make(map[string]map[string]*url.URL)
	}

	pages, ok := r.urls[target]
	if !ok {
		pages = make(map[string]*url.URL)
		r.urls[target] = pages
	}
	pages[page] = base

	return nil
}

// NewReferences returns an empty, ready-to-use References.
func NewReferences() *References {
	return &References{
		urls: make(map[string]map[string]*url.URL),
	}
}
|
||||
|
||||
func main() {
|
||||
|
||||
fs := flag.NewFlagSet("dharma", flag.ExitOnError)
|
||||
startUrl := fs.String("url", "", "Url to start analyzing")
|
||||
|
||||
if err := fs.Parse(os.Args); err != nil {
|
||||
log.Println(err)
|
||||
fs.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
if *startUrl == "" {
|
||||
fs.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
errors := make(map[string]int)
|
||||
errorsMu := sync.RWMutex{}
|
||||
references := NewReferences()
|
||||
|
||||
c := colly.NewCollector(
|
||||
// colly.AllowedDomains(hostname),
|
||||
// colly.Async(true),
|
||||
)
|
||||
|
||||
c.OnHTML("[href]", func(e *colly.HTMLElement) {
|
||||
if e.Request.URL.Host != *startUrl {
|
||||
return
|
||||
}
|
||||
|
||||
href, err := parseAttr(e, "href")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
references.Register(e.Request.URL, href)
|
||||
|
||||
e.Request.Visit(e.Attr("href"))
|
||||
})
|
||||
|
||||
c.OnHTML("[src]", func(e *colly.HTMLElement) {
|
||||
if e.Request.URL.Host != *startUrl {
|
||||
return
|
||||
}
|
||||
|
||||
href, err := parseAttr(e, "src")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
references.Register(e.Request.URL, href)
|
||||
|
||||
e.Request.Visit(e.Attr("src"))
|
||||
})
|
||||
|
||||
c.OnResponse(func(r *colly.Response) {
|
||||
// log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode)
|
||||
})
|
||||
|
||||
c.OnError(func(r *colly.Response, e error) {
|
||||
log.Printf("%d error %s: %s ", r.StatusCode, r.Request.URL.String(), e)
|
||||
errorsMu.Lock()
|
||||
errors[r.Request.URL.String()] = r.StatusCode
|
||||
errorsMu.Unlock()
|
||||
})
|
||||
|
||||
c.Limit(&colly.LimitRule{Parallelism: 2})
|
||||
c.SetRequestTimeout(10 * time.Second)
|
||||
|
||||
c.Visit(*startUrl)
|
||||
|
||||
c.Wait()
|
||||
|
||||
for errUrl, statusCode := range errors {
|
||||
log.Printf("[%d] %s", statusCode, errUrl)
|
||||
|
||||
parsedURL, _ := url.Parse(errUrl)
|
||||
log.Println(" Found in:")
|
||||
for _, r := range references.From(parsedURL.String()) {
|
||||
log.Printf(" - %s", r)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
found
|
|
@ -0,0 +1,23 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Document</title>
|
||||
<link rel="stylesheet" href="/static/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<a href="/found.html">found.html</a> <br>
|
||||
<a href="/not_found.html">not_found.html</a> <br>
|
||||
<a href="/rel/index.html">rel.html</a>
|
||||
<a href="https://fmartingr.com">External link (ok)</a> <br>
|
||||
<a href="http://fmartingr.com">External link (ok - http redir)</a> <br>
|
||||
<a href="https://www.e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi.com">External link (nack)</a>
|
||||
<br>
|
||||
<script src="/static/script.js" type="text/javascript"></script>
|
||||
</body>
|
||||
|
||||
</html>
|
|
@ -0,0 +1,22 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Document</title>
|
||||
<link rel="stylesheet" href="./style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<a href="./relfound.html">found.html</a> <br>
|
||||
<a href="./rel_not_found.html">not_found.html</a> <br>
|
||||
<img src="./image.jpg"> <br>
|
||||
<img src="./image-404.jpg"> <br>
|
||||
<img src="./e3H7iaV685rbH7R5lBNxgpietP7JTnMeknmi9SNAEUT4XSiH2sET6ixAcjhy4CAi"> <br>
|
||||
<br>
|
||||
<script src="/static/script.js" type="text/javascript"></script>
|
||||
</body>
|
||||
|
||||
</html>
|
|
@ -0,0 +1 @@
|
|||
found
|
Loading…
Reference in New Issue