// Mirror of https://github.com/fmartingr/dharma.git (173 lines, 3.5 KiB, Go).

package main
|
|
|
|
import (
|
|
"flag"
|
|
"log"
|
|
"net/url"
|
|
"os"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gocolly/colly"
|
|
)
|
|
|
|
// RemoveAsciiTabAndNewlines returns s with every ASCII tab, line feed and
// carriage return removed, following step 3 of
// https://url.spec.whatwg.org/#concept-basic-url-parser. Step 2 of the spec
// calls these characters a "validation error", but that is not a hard error:
// browsers silently drop them.
//
// See https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77
//
// The function is mostly used internally, but it is exported for extra
// convenience.
func RemoveAsciiTabAndNewlines(s string) string {
	// strings.Map drops a rune whenever the mapping returns a negative value.
	dropTabsAndNewlines := func(r rune) rune {
		if r == '\t' || r == '\n' || r == '\r' {
			return -1
		}
		return r
	}
	return strings.Map(dropTabsAndNewlines, s)
}
|
|
|
|
func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) {
|
|
href := RemoveAsciiTabAndNewlines(e.Attr(attr))
|
|
if !strings.HasPrefix(href, "http") {
|
|
href = e.Request.AbsoluteURL(href)
|
|
}
|
|
u, err := url.Parse(href)
|
|
if err != nil {
|
|
log.Printf("in: %s", e.Request.URL.String())
|
|
log.Printf("error parsing %s: %s", attr, err)
|
|
return nil, err
|
|
}
|
|
|
|
return u, nil
|
|
}
|
|
|
|
// References is a lock-protected index from a link target to the pages that
// reference it.
type References struct {
	// urlMu guards urls.
	urlMu sync.RWMutex
	// urls maps the string form of a referenced URL to the set of referring
	// pages, itself keyed by each referring page's URL string.
	urls map[string]map[string]*url.URL
}
|
|
|
|
func (r *References) From(refLink string) (result []*url.URL) {
|
|
r.urlMu.RLock()
|
|
defer r.urlMu.RUnlock()
|
|
|
|
ref, exists := r.urls[refLink]
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
for _, v := range ref {
|
|
result = append(result, v)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (r *References) Register(base *url.URL, refLink *url.URL) error {
|
|
r.urlMu.Lock()
|
|
defer r.urlMu.Unlock()
|
|
|
|
ref, exists := r.urls[refLink.String()]
|
|
if !exists {
|
|
r.urls[refLink.String()] = make(map[string]*url.URL)
|
|
r.urls[refLink.String()][base.String()] = base
|
|
} else {
|
|
ref[base.String()] = base
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func NewReferences() *References {
|
|
return &References{
|
|
urls: make(map[string]map[string]*url.URL),
|
|
urlMu: sync.RWMutex{},
|
|
}
|
|
}
|
|
|
|
func main() {
|
|
|
|
fs := flag.NewFlagSet("dharma", flag.ExitOnError)
|
|
startUrl := fs.String("url", "", "Url to start analyzing")
|
|
|
|
if err := fs.Parse(os.Args); err != nil {
|
|
log.Println(err)
|
|
fs.Usage()
|
|
return
|
|
}
|
|
|
|
if *startUrl == "" {
|
|
fs.Usage()
|
|
return
|
|
}
|
|
|
|
errors := make(map[string]int)
|
|
errorsMu := sync.RWMutex{}
|
|
references := NewReferences()
|
|
|
|
c := colly.NewCollector(
|
|
// colly.AllowedDomains(hostname),
|
|
// colly.Async(true),
|
|
)
|
|
|
|
c.OnHTML("[href]", func(e *colly.HTMLElement) {
|
|
if e.Request.URL.Host != *startUrl {
|
|
return
|
|
}
|
|
|
|
href, err := parseAttr(e, "href")
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
references.Register(e.Request.URL, href)
|
|
|
|
e.Request.Visit(e.Attr("href"))
|
|
})
|
|
|
|
c.OnHTML("[src]", func(e *colly.HTMLElement) {
|
|
if e.Request.URL.Host != *startUrl {
|
|
return
|
|
}
|
|
|
|
href, err := parseAttr(e, "src")
|
|
if err != nil {
|
|
return
|
|
}
|
|
references.Register(e.Request.URL, href)
|
|
|
|
e.Request.Visit(e.Attr("src"))
|
|
})
|
|
|
|
c.OnResponse(func(r *colly.Response) {
|
|
// log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode)
|
|
})
|
|
|
|
c.OnError(func(r *colly.Response, e error) {
|
|
log.Printf("%d error %s: %s ", r.StatusCode, r.Request.URL.String(), e)
|
|
errorsMu.Lock()
|
|
errors[r.Request.URL.String()] = r.StatusCode
|
|
errorsMu.Unlock()
|
|
})
|
|
|
|
c.Limit(&colly.LimitRule{Parallelism: 2})
|
|
c.SetRequestTimeout(10 * time.Second)
|
|
|
|
c.Visit(*startUrl)
|
|
|
|
c.Wait()
|
|
|
|
for errUrl, statusCode := range errors {
|
|
log.Printf("[%d] %s", statusCode, errUrl)
|
|
|
|
parsedURL, _ := url.Parse(errUrl)
|
|
log.Println(" Found in:")
|
|
for _, r := range references.From(parsedURL.String()) {
|
|
log.Printf(" - %s", r)
|
|
}
|
|
}
|
|
}
|