Add initial archiver

Radhi Fadlillah 2019-06-09 14:54:07 +07:00
parent 95c8717855
commit 4e38387170
16 changed files with 1524 additions and 38 deletions

4
go.mod
View File

@@ -20,9 +20,11 @@ require (
github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
github.com/sirupsen/logrus v1.4.2
github.com/spf13/cobra v0.0.4
github.com/tdewolff/parse/v2 v2.3.7
go.etcd.io/bbolt v1.3.2
golang.org/x/crypto v0.0.0-20190513172903-22d7a77e9e5f
golang.org/x/image v0.0.0-20190523035834-f03afa92d3ff // indirect
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 // indirect
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
golang.org/x/sys v0.0.0-20190526052359-791d8a0f4d09 // indirect
golang.org/x/tools v0.0.0-20190525145741-7be61e1b0e51 // indirect
google.golang.org/appengine v1.6.0 // indirect

6
go.sum
View File

@@ -80,8 +80,14 @@ github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/tdewolff/parse/v2 v2.3.7 h1:DXoTUgrUE2Eap0m7zg1ljCO5C78vhEi7HTc4YnJWrRk=
github.com/tdewolff/parse/v2 v2.3.7/go.mod h1:HansaqmN4I/U7L6/tUp0NcwT2tFO0F4EAWYGSDzkYNk=
github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU=
github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
go.etcd.io/bbolt v1.3.2 h1:Z/90sZLPOeCy2PwprqkFa25PdkusRzaj9P8zm/KNyvk=
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=

View File

@@ -1,12 +1,17 @@
package cmd
import (
"bytes"
"fmt"
"io"
"net/http"
nurl "net/url"
fp "path/filepath"
"strings"
"time"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/go-shiori/go-readability"
"github.com/go-shiori/shiori/internal/model"
"github.com/spf13/cobra"
@@ -73,14 +78,36 @@ func addHandler(cmd *cobra.Command, args []string) {
func() {
cInfo.Println("Downloading article...")
resp, err := httpClient.Get(url)
// Prepare request
req, err := http.NewRequest("GET", url, nil)
if err != nil {
cError.Printf("Failed to download article: %v\n", err)
return
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
resp, err := httpClient.Do(req)
if err != nil {
cError.Printf("Failed to download article: %v\n", err)
return
}
defer resp.Body.Close()
article, err := readability.FromReader(resp.Body, url)
// Save as archive
buffer := bytes.NewBuffer(nil)
tee := io.TeeReader(resp.Body, buffer)
contentType := resp.Header.Get("Content-Type")
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
err = warc.FromReader(tee, url, contentType, archivePath)
if err != nil {
cError.Printf("Failed to create archive: %v\n", err)
return
}
// Parse article
article, err := readability.FromReader(buffer, url)
if err != nil {
cError.Printf("Failed to parse article: %v\n", err)
return

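The handler above needs to consume the response body twice: once for the archiver and once for readability. Below is a minimal, self-contained sketch of the io.TeeReader pattern it relies on (standard library only; the HTML string and variable names are illustrative):

package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"strings"
)

func main() {
	body := strings.NewReader("<html><body>hello</body></html>")

	// Everything read from tee is also copied into buffer,
	// so the same stream can be consumed a second time.
	buffer := bytes.NewBuffer(nil)
	tee := io.TeeReader(body, buffer)

	first, _ := ioutil.ReadAll(tee)     // first consumer, e.g. the archiver
	second, _ := ioutil.ReadAll(buffer) // second consumer, e.g. readability

	fmt.Println(len(first), len(second)) // both see the full body
}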
View File

@@ -4,6 +4,7 @@ import (
"fmt"
"os"
fp "path/filepath"
"strconv"
"strings"
"github.com/spf13/cobra"
@@ -57,18 +58,20 @@ func deleteHandler(cmd *cobra.Command, args []string) {
return
}
// Delete thumbnail image from local disk
// Delete thumbnail image and archives from local disk
if len(ids) == 0 {
thumbDir := fp.Join(DataDir, "thumb")
archiveDir := fp.Join(DataDir, "archive")
os.RemoveAll(thumbDir)
os.RemoveAll(archiveDir)
} else {
for _, id := range ids {
imgPath := fp.Join(DataDir, "thumb", fmt.Sprintf("%d.*", id))
matchedFiles, _ := fp.Glob(imgPath)
strID := strconv.Itoa(id)
imgPath := fp.Join(DataDir, "thumb", strID)
archivePath := fp.Join(DataDir, "archive", strID)
for _, f := range matchedFiles {
os.Remove(f)
}
os.Remove(imgPath)
os.Remove(archivePath)
}
}

View File

@@ -2,9 +2,15 @@ package cmd
import (
"fmt"
"net"
"net/http"
fp "path/filepath"
"strconv"
"strings"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/julienschmidt/httprouter"
"github.com/spf13/cobra"
)
@@ -20,6 +26,7 @@ func openCmd() *cobra.Command {
}
cmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt and open ALL bookmarks")
cmd.Flags().BoolP("archive", "a", false, "Open the bookmark's archived content")
cmd.Flags().BoolP("text-cache", "t", false, "Open the bookmark's text cache in terminal")
return cmd
@@ -28,8 +35,22 @@ func openCmd() *cobra.Command {
func openHandler(cmd *cobra.Command, args []string) {
// Parse flags
skipConfirm, _ := cmd.Flags().GetBool("yes")
archiveMode, _ := cmd.Flags().GetBool("archive")
textCacheMode, _ := cmd.Flags().GetBool("text-cache")
// Convert args to ids
ids, err := parseStrIndices(args)
if err != nil {
cError.Println(err)
return
}
// If in archive mode, only one bookmark allowed
if len(ids) > 1 && archiveMode {
cError.Println("In archive mode, only one bookmark allowed")
return
}
// If no arguments (i.e all bookmarks will be opened),
// confirm to user
if len(args) == 0 && !skipConfirm {
@@ -42,13 +63,6 @@ func openHandler(cmd *cobra.Command, args []string) {
}
}
// Convert args to ids
ids, err := parseStrIndices(args)
if err != nil {
cError.Println(err)
return
}
// Read bookmarks from database
getOptions := database.GetBookmarksOptions{
IDs: ids,
@@ -62,17 +76,16 @@ func openHandler(cmd *cobra.Command, args []string) {
}
if len(bookmarks) == 0 {
switch {
case len(ids) > 0:
if len(ids) > 0 {
cError.Println("No matching index found")
default:
} else {
cError.Println("No bookmarks saved yet")
}
return
}
// If not text cache mode, open bookmarks in browser
if !textCacheMode {
// If not text cache mode nor archive mode, open bookmarks in browser
if !textCacheMode && !archiveMode {
for _, book := range bookmarks {
err = openBrowser(book.URL)
if err != nil {
@@ -83,22 +96,74 @@ func openHandler(cmd *cobra.Command, args []string) {
}
// Show bookmarks content in terminal
termWidth := getTerminalWidth()
if textCacheMode {
termWidth := getTerminalWidth()
for _, book := range bookmarks {
cIndex.Printf("%d. ", book.ID)
cTitle.Println(book.Title)
fmt.Println()
for _, book := range bookmarks {
cIndex.Printf("%d. ", book.ID)
cTitle.Println(book.Title)
fmt.Println()
if book.Content == "" {
cError.Println("This bookmark doesn't have any cached content")
} else {
book.Content = strings.Join(strings.Fields(book.Content), " ")
fmt.Println(book.Content)
if book.Content == "" {
cError.Println("This bookmark doesn't have any cached content")
} else {
book.Content = strings.Join(strings.Fields(book.Content), " ")
fmt.Println(book.Content)
}
fmt.Println()
cSymbol.Println(strings.Repeat("=", termWidth))
fmt.Println()
}
}
// Open archive
id := strconv.Itoa(bookmarks[0].ID)
archivePath := fp.Join(DataDir, "archive", id)
archive, err := warc.Open(archivePath)
if err != nil {
cError.Printf("Failed to open archive: %v\n", err)
return
}
defer archive.Close()
// Create simple server
router := httprouter.New()
router.GET("/*filename", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
filename := ps.ByName("filename")
resourceName := fp.Base(filename)
if resourceName == "/" {
resourceName = ""
}
fmt.Println()
cSymbol.Println(strings.Repeat("=", termWidth))
fmt.Println()
content, contentType, err := archive.Read(resourceName)
if err != nil {
panic(err)
}
w.Header().Set("Content-Type", contentType)
if _, err = w.Write(content); err != nil {
panic(err)
}
})
router.PanicHandler = func(w http.ResponseWriter, r *http.Request, arg interface{}) {
http.Error(w, fmt.Sprint(arg), 500)
}
// Choose random port
listener, err := net.Listen("tcp", ":0")
if err != nil {
cError.Printf("Failed to serve archive: %v\n", err)
return
}
portNumber := listener.Addr().(*net.TCPAddr).Port
cInfo.Printf("Archive served in http://localhost:%d\n", portNumber)
err = http.Serve(listener, router)
if err != nil {
cError.Printf("Failed to serve archive: %v\n", err)
}
}

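The open command serves the archive on a random free port by listening on ":0" and reading the chosen port back from the listener. A stand-alone sketch of that trick (the handler body is a placeholder, not the archive handler above):

package main

import (
	"fmt"
	"log"
	"net"
	"net/http"
)

func main() {
	// Port 0 asks the OS to pick any free port.
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		log.Fatal(err)
	}

	port := listener.Addr().(*net.TCPAddr).Port
	fmt.Printf("Serving at http://localhost:%d\n", port)

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "archived content would be written here")
	})
	log.Fatal(http.Serve(listener, nil))
}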
View File

@@ -1,7 +1,9 @@
package cmd
import (
"crypto/tls"
"net/http"
"net/http/cookiejar"
"time"
"github.com/go-shiori/shiori/internal/database"
@@ -15,9 +17,22 @@ var (
// DataDir is directory for downloaded data
DataDir string
httpClient = &http.Client{Timeout: time.Minute}
httpClient *http.Client
)
func init() {
jar, _ := cookiejar.New(nil)
httpClient = &http.Client{
Timeout: time.Minute,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
},
Jar: jar,
}
}
// ShioriCmd returns the root command for shiori
func ShioriCmd() *cobra.Command {
rootCmd := &cobra.Command{

View File

@@ -1,7 +1,10 @@
package cmd
import (
"bytes"
"fmt"
"io"
"net/http"
nurl "net/url"
fp "path/filepath"
"sort"
@@ -12,6 +15,7 @@ import (
"github.com/go-shiori/go-readability"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/internal/model"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/spf13/cobra"
)
@@ -139,8 +143,17 @@ func updateHandler(cmd *cobra.Command, args []string) {
<-semaphore
}()
// Download article
resp, err := httpClient.Get(book.URL)
// Prepare request
req, err := http.NewRequest("GET", book.URL, nil)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
return
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
resp, err := httpClient.Do(req)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
@@ -148,7 +161,21 @@ func updateHandler(cmd *cobra.Command, args []string) {
}
defer resp.Body.Close()
article, err := readability.FromReader(resp.Body, book.URL)
// Save as archive
buffer := bytes.NewBuffer(nil)
tee := io.TeeReader(resp.Body, buffer)
contentType := resp.Header.Get("Content-Type")
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
err = warc.FromReader(tee, book.URL, contentType, archivePath)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)
return
}
// Parse article
article, err := readability.FromReader(buffer, book.URL)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err)

View File

@@ -0,0 +1,173 @@
package archiver
import (
"fmt"
"strings"
"sync"
"time"
"go.etcd.io/bbolt"
)
// Archiver is a struct for archiving a URL and its resources.
type Archiver struct {
sync.RWMutex
sync.WaitGroup
DB *bbolt.DB
ChDone chan struct{}
ChErrors chan error
ChWarnings chan error
ChRequest chan ResourceURL
ResourceMap map[string]struct{}
LogEnabled bool
}
// Close closes the channels used by the Archiver.
func (arc *Archiver) Close() {
close(arc.ChErrors)
close(arc.ChWarnings)
close(arc.ChRequest)
}
// StartArchiver starts the archival process.
func (arc *Archiver) StartArchiver() []error {
go func() {
time.Sleep(time.Second)
arc.Wait()
close(arc.ChDone)
}()
// Download the URLs concurrently. After each download finishes,
// parse the response to extract the resource URLs inside it,
// then send them to the request channel to be downloaded as well.
errors := make([]error, 0)
warnings := make([]error, 0)
func() {
for {
select {
case <-arc.ChDone:
return
case err := <-arc.ChErrors:
errors = append(errors, err)
case err := <-arc.ChWarnings:
warnings = append(warnings, err)
case res := <-arc.ChRequest:
arc.RLock()
_, exist := arc.ResourceMap[res.DownloadURL]
arc.RUnlock()
if !exist {
arc.Add(1)
go arc.archive(res)
}
}
}
}()
// Print log message if required
if arc.LogEnabled {
nErrors := len(errors)
nWarnings := len(warnings)
arc.Logf(infoLog, "Download finished with %d warnings and %d errors\n", nWarnings, nErrors)
if nWarnings > 0 {
fmt.Println()
for _, warning := range warnings {
arc.Log(warningLog, warning)
}
}
if nErrors > 0 {
for _, err := range errors {
arc.Log(errorLog, err)
}
}
}
return nil
}
// archive downloads a subresource and saves it to storage.
func (arc *Archiver) archive(res ResourceURL) {
// Make sure to decrease wait group once finished
defer arc.Done()
// Download resource
resp, err := DownloadData(res.DownloadURL)
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to download %s: %v", res.DownloadURL, err)
return
}
defer resp.Body.Close()
// Process the resource depending on its type.
// Since this `archive` method is only used for processing sub
// resources, we only give special treatment to CSS sub resources.
// Any other file is simply saved as it is.
var result ProcessResult
var subResources []ResourceURL
cType := resp.Header.Get("Content-Type")
switch {
case strings.Contains(cType, "text/css"):
result, subResources, err = arc.ProcessCSSFile(res, resp.Body)
default:
result, err = arc.ProcessOtherFile(res, resp.Body)
}
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to process %s: %v", res.DownloadURL, err)
return
}
// Add this url to resource map
arc.Lock()
arc.ResourceMap[res.DownloadURL] = struct{}{}
arc.Unlock()
// Save content to storage
arc.Logf(infoLog, "Downloaded %s, parent %s", res.DownloadURL, res.Parent)
result.ContentType = cType
err = arc.SaveToStorage(result)
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
return
}
// Send sub resource to request channel
for _, subRes := range subResources {
arc.ChRequest <- subRes
}
}
// SaveToStorage saves the processing result to storage.
func (arc *Archiver) SaveToStorage(result ProcessResult) error {
err := arc.DB.Batch(func(tx *bbolt.Tx) error {
bucket := tx.Bucket([]byte(result.Name))
if bucket != nil {
return nil
}
bucket, err := tx.CreateBucketIfNotExists([]byte(result.Name))
if err != nil {
return err
}
err = bucket.Put([]byte("content"), result.Content)
if err != nil {
return err
}
err = bucket.Put([]byte("type"), []byte(result.ContentType))
if err != nil {
return err
}
return nil
})
return err
}

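SaveToStorage stores every resource in its own bbolt bucket with two keys, "content" and "type". A minimal sketch of writing and reading that layout with bbolt directly (the file path and values are placeholders; this mirrors, but is not, the commit's code):

package main

import (
	"fmt"
	"log"

	"go.etcd.io/bbolt"
)

func main() {
	db, err := bbolt.Open("/tmp/sketch-archive", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// One bucket per resource, holding its body and its content type.
	err = db.Update(func(tx *bbolt.Tx) error {
		bucket, err := tx.CreateBucketIfNotExists([]byte("archive-root"))
		if err != nil {
			return err
		}
		if err := bucket.Put([]byte("content"), []byte("<html></html>")); err != nil {
			return err
		}
		return bucket.Put([]byte("type"), []byte("text/html"))
	})
	if err != nil {
		log.Fatal(err)
	}

	// Reading it back mirrors what pkg/warc's Archive.Read does.
	err = db.View(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket([]byte("archive-root"))
		fmt.Printf("%s: %s\n", bucket.Get([]byte("type")), bucket.Get([]byte("content")))
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}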
View File

@@ -0,0 +1,38 @@
package archiver
import (
"crypto/tls"
"net/http"
"net/http/cookiejar"
"time"
)
var (
defaultClient *http.Client
)
func init() {
jar, _ := cookiejar.New(nil)
defaultClient = &http.Client{
Timeout: time.Minute,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
},
Jar: jar,
}
}
// DownloadData downloads data from the specified URL.
func DownloadData(url string) (*http.Response, error) {
// Prepare request
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
return defaultClient.Do(req)
}

View File

@@ -0,0 +1,43 @@
package archiver
import "github.com/sirupsen/logrus"
type logType int
const (
infoLog logType = iota
errorLog
warningLog
)
// Log prints the log message, ended with a newline.
func (arc *Archiver) Log(tp logType, msgs ...interface{}) {
if !arc.LogEnabled {
return
}
switch tp {
case errorLog:
logrus.Errorln(msgs...)
case warningLog:
logrus.Warnln(msgs...)
default:
logrus.Infoln(msgs...)
}
}
// Logf prints the log message with the specified format.
func (arc *Archiver) Logf(tp logType, format string, msgs ...interface{}) {
if !arc.LogEnabled {
return
}
switch tp {
case errorLog:
logrus.Errorf(format, msgs...)
case warningLog:
logrus.Warnf(format, msgs...)
default:
logrus.Infof(format, msgs...)
}
}

View File

@@ -0,0 +1,468 @@
package archiver
import (
"bytes"
"fmt"
"io"
nurl "net/url"
"regexp"
"strings"
"github.com/tdewolff/parse/v2/css"
"github.com/tdewolff/parse/v2/js"
"golang.org/x/net/html"
)
// ProcessResult is the result from content processing.
type ProcessResult struct {
Name string
ContentType string
Content []byte
}
var (
rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`)
rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
)
// ProcessHTMLFile processes an HTML file that is submitted through the io.Reader.
func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
// Parse HTML document
doc, err := html.Parse(input)
if err != nil {
return ProcessResult{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", res.DownloadURL, err)
}
// Parse URL
parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
}
// Convert lazy-loaded images to normal images
fixLazyImages(doc)
// Convert hyperlinks with relative URLs
fixRelativeURIs(doc, parsedURL)
// Extract resources from each node
for _, node := range getElementsByTagName(doc, "*") {
// First extract resources from inline style
cssResources := extractInlineCSS(node, parsedURL)
resources = append(resources, cssResources...)
// Next extract resources from tag's specific attribute
nodeResources := []ResourceURL{}
switch tagName(node) {
case "style":
nodeResources = extractStyleTag(node, parsedURL)
case "script":
nodeResources = extractScriptTag(node, parsedURL)
case "meta":
nodeResources = extractMetaTag(node, parsedURL)
case "img", "picture", "figure", "video", "audio", "source":
nodeResources = extractMediaTag(node, parsedURL)
case "link":
nodeResources = extractGenericTag(node, "href", parsedURL)
case "iframe":
nodeResources = extractGenericTag(node, "src", parsedURL)
case "object":
nodeResources = extractGenericTag(node, "data", parsedURL)
default:
continue
}
resources = append(resources, nodeResources...)
}
// Get outer HTML of the doc
result = ProcessResult{
Name: res.ArchivalURL,
Content: outerHTML(doc),
}
return result, resources, nil
}
// ProcessCSSFile processes a CSS file that is submitted through the io.Reader.
func (arc *Archiver) ProcessCSSFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
// Parse URL
parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
}
// Extract CSS rules
rules, resources := processCSS(input, parsedURL)
result = ProcessResult{
Name: res.ArchivalURL,
Content: []byte(rules),
}
return result, resources, nil
}
// ProcessOtherFile processes files that are not HTML, JS or CSS and are submitted through the io.Reader.
func (arc *Archiver) ProcessOtherFile(res ResourceURL, input io.Reader) (result ProcessResult, err error) {
// Copy data to buffer
buffer := bytes.NewBuffer(nil)
_, err = io.Copy(buffer, input)
if err != nil {
return ProcessResult{}, fmt.Errorf("failed to copy data: %v", err)
}
// Create result
result = ProcessResult{
Name: res.ArchivalURL,
Content: buffer.Bytes(),
}
return result, nil
}
// fixRelativeURIs converts each <a> in the given element
// to an absolute URI, ignoring #ref URIs.
func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) {
links := getAllNodesWithTag(doc, "a")
forEachNode(links, func(link *html.Node, _ int) {
href := getAttribute(link, "href")
if href == "" {
return
}
// Replace links with javascript: URIs with text content,
// since they won't work after scripts have been removed
// from the page.
if strings.HasPrefix(href, "javascript:") {
text := createTextNode(textContent(link))
replaceNode(link, text)
} else {
newHref := toAbsoluteURI(href, pageURL)
if newHref == "" {
removeAttribute(link, "href")
} else {
setAttribute(link, "href", newHref)
}
}
})
}
// fixLazyImages converts images and figures that have properties like data-src into
// images that can be loaded without JS.
func fixLazyImages(root *html.Node) {
imageNodes := getAllNodesWithTag(root, "img", "picture", "figure")
forEachNode(imageNodes, func(elem *html.Node, _ int) {
src := getAttribute(elem, "src")
srcset := getAttribute(elem, "srcset")
nodeTag := tagName(elem)
nodeClass := className(elem)
if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") {
for i := 0; i < len(elem.Attr); i++ {
attr := elem.Attr[i]
if attr.Key == "src" || attr.Key == "srcset" {
continue
}
copyTo := ""
if rxLazyImageSrcset.MatchString(attr.Val) {
copyTo = "srcset"
} else if rxLazyImageSrc.MatchString(attr.Val) {
copyTo = "src"
}
if copyTo == "" {
continue
}
if nodeTag == "img" || nodeTag == "picture" {
// if this is an img or picture, set the attribute directly
setAttribute(elem, copyTo, attr.Val)
} else if nodeTag == "figure" && len(getAllNodesWithTag(elem, "img", "picture")) == 0 {
// if the item is a <figure> that does not contain an image or picture,
// create one and place it inside the figure; see the nytimes-3
// testcase for an example.
img := createElement("img")
setAttribute(img, copyTo, attr.Val)
appendChild(elem, img)
}
}
}
})
}
// extractInlineCSS extracts archival resources from the CSS rules inside
// the style attribute. Once finished, all CSS URLs in the style attribute
// will be updated to use the archival URL.
func extractInlineCSS(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Make sure this node has inline style
styleAttr := getAttribute(node, "style")
if styleAttr == "" {
return nil
}
// Extract resource URLs from the inline style
// and update the CSS rules accordingly.
reader := strings.NewReader(styleAttr)
newStyleAttr, resources := processCSS(reader, pageURL)
setAttribute(node, "style", newStyleAttr)
return resources
}
// extractStyleTag extracts archival resources from inside a <style> tag.
// Once finished, all CSS URLs will be updated to use the archival URL.
func extractStyleTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Extract CSS rules from <style>
rules := textContent(node)
rules = strings.TrimSpace(rules)
if rules == "" {
return nil
}
// Extract resource URLs from the rules and update it accordingly.
reader := strings.NewReader(rules)
newRules, resources := processCSS(reader, pageURL)
setTextContent(node, newRules)
return resources
}
// extractScriptTag extracts archival resources from inside a <script> tag.
// Once finished, all URLs inside it will be updated to use the archival URL.
func extractScriptTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Also get the URL from `src` attribute
resources := extractGenericTag(node, "src", pageURL)
// Extract JS code from the <script> itself
script := textContent(node)
script = strings.TrimSpace(script)
if script == "" {
return resources
}
reader := strings.NewReader(script)
newScript, scriptResources := processJS(reader, pageURL)
setTextContent(node, newScript)
resources = append(resources, scriptResources...)
return resources
}
// extractMetaTag extracts archival resources from inside a <meta> tag.
// Normally, <meta> doesn't carry any resource URLs. However, as
// social media has grown, new metadata has been added to point to
// the hero image for a web page, e.g. og:image, twitter:image, etc.
// Once finished, all image URLs in the <meta> will be updated
// to use the archival URL.
func extractMetaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
name := getAttribute(node, "name")
property := getAttribute(node, "property")
content := getAttribute(node, "content")
// If this <meta> is not for image, don't process it
if !rxImageMeta.MatchString(name + " " + property) {
return nil
}
// If URL is not valid, skip
tmp, err := nurl.ParseRequestURI(content)
if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
return nil
}
// Create the archive resource and update the content URL
res := ToResourceURL(content, pageURL)
if res.ArchivalURL == "" {
return nil
}
setAttribute(node, "content", res.ArchivalURL)
return []ResourceURL{res}
}
// extractMediaTag extracts resources from inside a media tag, e.g.
// <img>, <video>, <audio>, <source>. Once finished, all URLs will be
// updated to use the archival URL.
func extractMediaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
src := getAttribute(node, "src")
poster := getAttribute(node, "poster")
strSrcSets := getAttribute(node, "srcset")
// Create initial resources
resources := []ResourceURL{}
// Save `src` and `poster` to resources
if src != "" {
res := ToResourceURL(src, pageURL)
if res.ArchivalURL != "" {
setAttribute(node, "src", res.ArchivalURL)
resources = append(resources, res)
}
}
if poster != "" {
res := ToResourceURL(poster, pageURL)
if res.ArchivalURL != "" {
setAttribute(node, "poster", res.ArchivalURL)
resources = append(resources, res)
}
}
// Split srcset by comma, then process it like any URLs
srcSets := strings.Split(strSrcSets, ",")
for i, srcSet := range srcSets {
srcSet = strings.TrimSpace(srcSet)
parts := strings.SplitN(srcSet, " ", 2)
if parts[0] == "" {
continue
}
res := ToResourceURL(parts[0], pageURL)
if res.ArchivalURL == "" {
continue
}
srcSets[i] = strings.Replace(srcSets[i], parts[0], res.ArchivalURL, 1)
resources = append(resources, res)
}
if len(srcSets) > 0 {
setAttribute(node, "srcset", strings.Join(srcSets, ","))
}
return resources
}
// extractGenericTag extracts a resource from the specified attribute.
// This method is used for tags where the URL obviously exists in
// the tag, without any additional processing needed to extract it.
// Examples are <link> with its href, <object> with its data, etc.
// Once finished, the URL attribute will be updated to use the
// archival URL.
func extractGenericTag(node *html.Node, attrName string, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
attrValue := getAttribute(node, attrName)
if attrValue == "" {
return nil
}
res := ToResourceURL(attrValue, pageURL)
if res.ArchivalURL == "" {
return nil
}
setAttribute(node, attrName, res.ArchivalURL)
return []ResourceURL{res}
}
// processCSS extracts resource URLs from the specified CSS input.
// Returns the new rules with all CSS URLs updated to the archival link.
func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
// Prepare buffers
buffer := bytes.NewBuffer(nil)
// Scan CSS file and process the resource's URL
lexer := css.NewLexer(input)
resources := []ResourceURL{}
for {
token, bt := lexer.Next()
// Check for error
if token == css.ErrorToken {
break
}
// If it's not a URL, just write it to the buffer as it is
if token != css.URLToken {
buffer.Write(bt)
continue
}
// Sanitize the URL by removing `url()`, quotation mark and trailing slash
cssURL := string(bt)
cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
// Save the CSS URL and replace it with archival URL
res := ToResourceURL(cssURL, baseURL)
if res.ArchivalURL == "" {
continue
}
cssURL = `url("` + res.ArchivalURL + `")`
buffer.WriteString(cssURL)
resources = append(resources, res)
}
// Return the new rules after all URLs have been processed
return buffer.String(), resources
}
// processJS extracts resource URLs from the specified JS input.
// Returns the new rules with all URLs updated to the archival link.
func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
// Prepare buffers
buffer := bytes.NewBuffer(nil)
// Scan JS file and process the resource's URL
lexer := js.NewLexer(input)
resources := []ResourceURL{}
for {
token, bt := lexer.Next()
// Check for error
if token == js.ErrorToken {
break
}
// If it's not a string, just write it to buffer as it is
if token != js.StringToken {
buffer.Write(bt)
continue
}
// Process the string.
// Unlike CSS, JS doesn't have its own URL token, so we can only guess whether
// a string is a URL or not. There are three criteria to decide if it's a URL:
// - it starts with http(s):// for an absolute URL
// - it starts with a slash (/) for a relative URL
// - it is surrounded by `url()` just like in CSS
// If it doesn't fulfill any of the criteria above, just write it as it is.
var res ResourceURL
var newURL string
text := string(bt)
text = rxSingleQuote.ReplaceAllString(text, "$1")
text = rxDoubleQuote.ReplaceAllString(text, "$1")
if strings.HasPrefix(text, "url(") {
cssURL := rxStyleURL.ReplaceAllString(text, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
res = ToResourceURL(cssURL, baseURL)
newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL)
} else {
buffer.Write(bt)
continue
}
if res.ArchivalURL == "" {
continue
}
buffer.WriteString(newURL)
resources = append(resources, res)
}
// Return the new script after all URLs have been processed
return buffer.String(), resources
}

View File

@@ -0,0 +1,50 @@
package archiver
import (
nurl "net/url"
"regexp"
"strings"
)
var (
rxHTTPScheme = regexp.MustCompile(`(?i)^https?:\/{2}`)
rxTrailingSlash = regexp.MustCompile(`(?i)/+$`)
rxRepeatedStrip = regexp.MustCompile(`(?i)-+`)
)
// ResourceURL is a struct that contains the URLs for downloading
// and archiving a resource.
type ResourceURL struct {
DownloadURL string
ArchivalURL string
Parent string
}
// ToResourceURL converts a URI into a ResourceURL.
func ToResourceURL(uri string, base *nurl.URL) ResourceURL {
// Make sure URL has a valid scheme
uri = strings.TrimSpace(uri)
switch {
case uri == "",
strings.Contains(uri, ":") && !rxHTTPScheme.MatchString(uri):
return ResourceURL{}
}
// Create archive URL
downloadURL := toAbsoluteURI(uri, base)
downloadURL = rxTrailingSlash.ReplaceAllString(downloadURL, "")
downloadURL = strings.ReplaceAll(downloadURL, " ", "+")
archivalURL := strings.Replace(downloadURL, "://", "/", 1)
archivalURL = strings.ReplaceAll(archivalURL, "?", "-")
archivalURL = strings.ReplaceAll(archivalURL, "#", "-")
archivalURL = strings.ReplaceAll(archivalURL, "/", "-")
archivalURL = strings.ReplaceAll(archivalURL, " ", "-")
archivalURL = rxRepeatedStrip.ReplaceAllString(archivalURL, "-")
return ResourceURL{
DownloadURL: downloadURL,
ArchivalURL: archivalURL,
Parent: base.String(),
}
}

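To make the URL mapping concrete, here is a hypothetical test (not part of this commit) showing what ToResourceURL produces for a relative URI, based on the code above:

package archiver

import (
	nurl "net/url"
	"testing"
)

func TestToResourceURLSketch(t *testing.T) {
	base, _ := nurl.ParseRequestURI("https://example.com/post/1")
	res := ToResourceURL("/css/style.css?v=1", base)

	// The URI is resolved against the base URL...
	if res.DownloadURL != "https://example.com/css/style.css?v=1" {
		t.Fatal(res.DownloadURL)
	}
	// ...then flattened into a name that is safe to use as a bbolt bucket.
	if res.ArchivalURL != "https-example.com-css-style.css-v=1" {
		t.Fatal(res.ArchivalURL)
	}
}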
View File

@@ -0,0 +1,334 @@
package archiver
import (
"bytes"
"strings"
"golang.org/x/net/html"
)
// getElementsByTagName returns a collection of all elements in the document with
// the specified tag name, as an array of Node objects.
// The special tag "*" represents all elements.
func getElementsByTagName(doc *html.Node, tagName string) []*html.Node {
var results []*html.Node
var finder func(*html.Node)
finder = func(node *html.Node) {
if node.Type == html.ElementNode && (tagName == "*" || node.Data == tagName) {
results = append(results, node)
}
for child := node.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
}
for child := doc.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
return results
}
// createElement creates a new ElementNode with specified tag.
func createElement(tagName string) *html.Node {
return &html.Node{
Type: html.ElementNode,
Data: tagName,
}
}
// createTextNode creates a new Text node.
func createTextNode(data string) *html.Node {
return &html.Node{
Type: html.TextNode,
Data: data,
}
}
// tagName returns the tag name of a Node.
// If it's not ElementNode, return empty string.
func tagName(node *html.Node) string {
if node.Type != html.ElementNode {
return ""
}
return node.Data
}
// getAttribute returns the value of a specified attribute on
// the element. If the given attribute does not exist, the value
// returned will be an empty string.
func getAttribute(node *html.Node, attrName string) string {
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
return node.Attr[i].Val
}
}
return ""
}
// setAttribute sets attribute for node. If attribute already exists,
// it will be replaced.
func setAttribute(node *html.Node, attrName string, attrValue string) {
attrIdx := -1
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
attrIdx = i
break
}
}
if attrIdx >= 0 {
node.Attr[attrIdx].Val = attrValue
} else {
node.Attr = append(node.Attr, html.Attribute{
Key: attrName,
Val: attrValue,
})
}
}
// removeAttribute removes attribute with given name.
func removeAttribute(node *html.Node, attrName string) {
attrIdx := -1
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
attrIdx = i
break
}
}
if attrIdx >= 0 {
a := node.Attr
a = append(a[:attrIdx], a[attrIdx+1:]...)
node.Attr = a
}
}
// hasAttribute returns a Boolean value indicating whether the
// specified node has the specified attribute or not.
func hasAttribute(node *html.Node, attrName string) bool {
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
return true
}
}
return false
}
// textContent returns the text content of the specified node,
// and all its descendants.
func textContent(node *html.Node) string {
var buffer bytes.Buffer
var finder func(*html.Node)
finder = func(n *html.Node) {
if n.Type == html.TextNode {
buffer.WriteString(n.Data)
}
for child := n.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
}
finder(node)
return buffer.String()
}
// outerHTML returns an HTML serialization of the element and its descendants.
func outerHTML(node *html.Node) []byte {
var buffer bytes.Buffer
err := html.Render(&buffer, node)
if err != nil {
return []byte{}
}
return buffer.Bytes()
}
// innerHTML returns the HTML content (inner HTML) of an element.
func innerHTML(node *html.Node) string {
var err error
var buffer bytes.Buffer
for child := node.FirstChild; child != nil; child = child.NextSibling {
err = html.Render(&buffer, child)
if err != nil {
return ""
}
}
return strings.TrimSpace(buffer.String())
}
// documentElement returns the Element that is the root element
// of the document. Since we are working with HTML documents,
// the root will be the <html> element.
func documentElement(doc *html.Node) *html.Node {
if nodes := getElementsByTagName(doc, "html"); len(nodes) > 0 {
return nodes[0]
}
return nil
}
// id returns the value of the id attribute of the specified element.
func id(node *html.Node) string {
id := getAttribute(node, "id")
id = strings.TrimSpace(id)
return id
}
// className returns the value of the class attribute of
// the specified element.
func className(node *html.Node) string {
className := getAttribute(node, "class")
className = strings.TrimSpace(className)
className = strings.Join(strings.Fields(className), " ")
return className
}
// children returns an HTMLCollection of the child elements of Node.
func children(node *html.Node) []*html.Node {
var children []*html.Node
if node == nil {
return nil
}
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
children = append(children, child)
}
}
return children
}
// childNodes returns list of a node's direct children.
func childNodes(node *html.Node) []*html.Node {
var childNodes []*html.Node
for child := node.FirstChild; child != nil; child = child.NextSibling {
childNodes = append(childNodes, child)
}
return childNodes
}
// firstElementChild returns the object's first child Element,
// or nil if there are no child elements.
func firstElementChild(node *html.Node) *html.Node {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
return child
}
}
return nil
}
// nextElementSibling returns the Element immediately following
// the specified one in its parent's children list, or nil if the
// specified Element is the last one in the list.
func nextElementSibling(node *html.Node) *html.Node {
for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling {
if sibling.Type == html.ElementNode {
return sibling
}
}
return nil
}
// appendChild adds a node to the end of the list of children of a
// specified parent node. If the given child is a reference to an
// existing node in the document, appendChild() moves it from its
// current position to the new position.
func appendChild(node *html.Node, child *html.Node) {
if child.Parent != nil {
temp := cloneNode(child)
node.AppendChild(temp)
child.Parent.RemoveChild(child)
} else {
node.AppendChild(child)
}
}
// replaceNode replaces an OldNode with a NewNode.
func replaceNode(oldNode *html.Node, newNode *html.Node) {
if oldNode.Parent == nil {
return
}
newNode.Parent = nil
newNode.PrevSibling = nil
newNode.NextSibling = nil
oldNode.Parent.InsertBefore(newNode, oldNode)
oldNode.Parent.RemoveChild(oldNode)
}
// includeNode determines if node is included inside nodeList.
func includeNode(nodeList []*html.Node, node *html.Node) bool {
for i := 0; i < len(nodeList); i++ {
if nodeList[i] == node {
return true
}
}
return false
}
// cloneNode returns a deep clone of the node and its children.
// However, it will be detached from the original's parents
// and siblings.
func cloneNode(src *html.Node) *html.Node {
clone := &html.Node{
Type: src.Type,
DataAtom: src.DataAtom,
Data: src.Data,
Attr: make([]html.Attribute, len(src.Attr)),
}
copy(clone.Attr, src.Attr)
for child := src.FirstChild; child != nil; child = child.NextSibling {
clone.AppendChild(cloneNode(child))
}
return clone
}
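// getAllNodesWithTag returns all nodes inside the given node that match
// any of the specified tag names.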
func getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
var result []*html.Node
for i := 0; i < len(tagNames); i++ {
result = append(result, getElementsByTagName(node, tagNames[i])...)
}
return result
}
// forEachNode iterates over a NodeList and runs fn on each node.
func forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) {
for i := 0; i < len(nodeList); i++ {
fn(nodeList[i], i)
}
}
// removeNodes iterates over a NodeList, calls `filterFn` for each node
// and removes the node if the function returns `true`. If no function is
// passed, removes all the nodes in the node list.
func removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) {
for i := len(nodeList) - 1; i >= 0; i-- {
node := nodeList[i]
parentNode := node.Parent
if parentNode != nil && (filterFn == nil || filterFn(node)) {
parentNode.RemoveChild(node)
}
}
}
// setTextContent sets the text content of the specified node.
func setTextContent(node *html.Node, text string) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Parent != nil {
child.Parent.RemoveChild(child)
}
}
node.AppendChild(&html.Node{
Type: html.TextNode,
Data: text,
})
}

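A hypothetical test (not part of this commit) showing how a few of the DOM helpers above compose, roughly the kind of rewrite fixLazyImages performs for a data-src attribute:

package archiver

import (
	"strings"
	"testing"

	"golang.org/x/net/html"
)

func TestDOMHelpersSketch(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(`<p><img data-src="a.png"></p>`))
	if err != nil {
		t.Fatal(err)
	}

	// Find the <img> and copy its lazy-load attribute into src.
	img := getAllNodesWithTag(doc, "img")[0]
	setAttribute(img, "src", getAttribute(img, "data-src"))

	if getAttribute(img, "src") != "a.png" {
		t.Fatalf("unexpected img: %s", outerHTML(img))
	}
}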
View File

@@ -0,0 +1,54 @@
package archiver
import (
nurl "net/url"
"regexp"
"strings"
)
var (
rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`)
rxSingleQuote = regexp.MustCompile(`(?i)^'(.*)'$`)
rxDoubleQuote = regexp.MustCompile(`(?i)^"(.*)"$`)
rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`)
)
func clearUTMParams(url *nurl.URL) {
queries := url.Query()
for key := range queries {
if strings.HasPrefix(key, "utm_") {
queries.Del(key)
}
}
url.RawQuery = queries.Encode()
}
// toAbsoluteURI converts uri to an absolute URI based on base.
// However, if uri is prefixed with hash (#), the uri won't be changed.
func toAbsoluteURI(uri string, base *nurl.URL) string {
if uri == "" || base == nil {
return ""
}
// If it is a hash fragment, return it as it is
if uri[:1] == "#" {
return uri
}
// If it is already an absolute URL, return as it is
tmp, err := nurl.ParseRequestURI(uri)
if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" {
return uri
}
// Otherwise, resolve against base URI.
tmp, err = nurl.Parse(uri)
if err != nil {
return uri
}
clearUTMParams(tmp)
return base.ResolveReference(tmp).String()
}

76
pkg/warc/reader.go Normal file
View File

@@ -0,0 +1,76 @@
package warc
import (
"fmt"
"os"
"go.etcd.io/bbolt"
)
// Archive is the storage for archiving the web page.
type Archive struct {
db *bbolt.DB
}
// Open opens the archive at the specified path.
func Open(path string) (*Archive, error) {
// Make sure archive exists
info, err := os.Stat(path)
if os.IsNotExist(err) || info.IsDir() {
return nil, fmt.Errorf("archive doesn't exist")
}
// Open database
options := &bbolt.Options{
ReadOnly: true,
}
db, err := bbolt.Open(path, os.ModePerm, options)
if err != nil {
return nil, err
}
return &Archive{db: db}, nil
}
// Close closes the storage.
func (arc *Archive) Close() {
arc.db.Close()
}
// Read fetches the resource with the specified name from the archive.
func (arc *Archive) Read(name string) ([]byte, string, error) {
// Make sure name exists
if name == "" {
name = "archive-root"
}
var content []byte
var strContentType string
err := arc.db.View(func(tx *bbolt.Tx) error {
bucket := tx.Bucket([]byte(name))
if bucket == nil {
return fmt.Errorf("%s doesn't exist", name)
}
contentType := bucket.Get([]byte("type"))
if contentType == nil {
return fmt.Errorf("%s doesn't exist", name)
}
strContentType = string(contentType)
content = bucket.Get([]byte("content"))
if content == nil {
return fmt.Errorf("%s doesn't exist", name)
}
return nil
})
if err != nil {
return nil, "", err
}
return content, strContentType, nil
}

105
pkg/warc/writer.go Normal file
View File

@@ -0,0 +1,105 @@
package warc
import (
"fmt"
"io"
nurl "net/url"
"os"
fp "path/filepath"
"strings"
"time"
"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
"go.etcd.io/bbolt"
)
// FromReader creates an archive from the specified io.Reader.
func FromReader(input io.Reader, url, contentType, dstPath string) error {
// Make sure URL is valid
parsedURL, err := nurl.ParseRequestURI(url)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return fmt.Errorf("url %s is not valid", url)
}
// Generate resource URL
res := archiver.ToResourceURL(url, parsedURL)
res.ArchivalURL = "archive-root"
// Create database for archive
os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
db, err := bbolt.Open(dstPath, os.ModePerm, nil)
if err != nil {
return fmt.Errorf("failed to create archive: %v", err)
}
// Create archiver
arc := &archiver.Archiver{
DB: db,
ChDone: make(chan struct{}),
ChErrors: make(chan error),
ChWarnings: make(chan error),
ChRequest: make(chan archiver.ResourceURL, 10),
ResourceMap: make(map[string]struct{}),
LogEnabled: true,
}
defer arc.Close()
// Process input depending on its type.
// If it's HTML, we need to extract the sub resources it uses, e.g. some CSS or JS files.
// If it's not HTML, we can just save it to the archive.
var result archiver.ProcessResult
var subResources []archiver.ResourceURL
if strings.Contains(contentType, "text/html") {
result, subResources, err = arc.ProcessHTMLFile(res, input)
} else {
result, err = arc.ProcessOtherFile(res, input)
}
if err != nil {
return fmt.Errorf("archival failed: %v", err)
}
// Add this url to resource map to mark it as processed
arc.ResourceMap[res.DownloadURL] = struct{}{}
// Save content to storage
arc.Logf(0, "Downloaded %s", res.DownloadURL)
result.ContentType = contentType
err = arc.SaveToStorage(result)
if err != nil {
return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
}
// If there are no sub resources found, our job is finished.
if len(subResources) == 0 {
return nil
}
// However, if there are, we need to run the archiver in the background to
// process the sub resources concurrently.
go func() {
for _, subRes := range subResources {
arc.ChRequest <- subRes
}
}()
time.Sleep(time.Second)
arc.StartArchiver()
return nil
}
// FromURL creates an archive from the specified URL.
func FromURL(url, dstPath string) error {
// Download URL
resp, err := archiver.DownloadData(url)
if err != nil {
return fmt.Errorf("failed to download %s: %v", url, err)
}
defer resp.Body.Close()
contentType := resp.Header.Get("Content-Type")
return FromReader(resp.Body, url, contentType, dstPath)
}
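Putting it together, a minimal sketch of how the public warc API introduced by this commit could be used end to end (the URL and archive path are placeholders):

package main

import (
	"fmt"
	"log"

	"github.com/go-shiori/shiori/pkg/warc"
)

func main() {
	// Download a page and all of its resources into a single bbolt file.
	if err := warc.FromURL("https://example.com", "/tmp/example-archive"); err != nil {
		log.Fatal(err)
	}

	// Re-open the archive later and read the root document back.
	archive, err := warc.Open("/tmp/example-archive")
	if err != nil {
		log.Fatal(err)
	}
	defer archive.Close()

	// An empty name falls back to "archive-root", i.e. the page itself.
	content, contentType, err := archive.Read("")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s, %d bytes\n", contentType, len(content))
}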