Web scraping - How do I have Go find packages online?
I'm trying to run a Go program using LiteIDE X22, but I get this message:
c:/go/bin/go.exe build [c:/users/admins/desktop/desktp/worm_scraper-master] worm_scraper.go:11:2: cannot find package "github.com/codegangsta/cli" in any of: c:\go\src\pkg\github.com\codegangsta\cli (from $GOROOT) c:\users\admins\gostuff\src\github.com\codegangsta\cli (from $GOPATH) worm_scraper.go:12:2: cannot find package "github.com/puerkitobio/goquery" in any of: c:\go\src\pkg\github.com\puerkitobio\goquery (from $GOROOT) c:\users\admins\gostuff\src\github.com\puerkitobio\goquery (from $GOPATH) Error: process exited with code 1.
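I think this means it's looking for the packages on my hard drive instead of online, right? (By the way, I'm pretty clueless about programming; I'm just trying to run something someone else wrote.) How do I get it to access the web? Here's the full code: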
package main import ( "errors" "fmt" "os" "os/exec" "regexp" "strings" "github.com/codegangsta/cli" "github.com/puerkitobio/goquery" ) const ( mainsite = "https://parahumans.wordpress.com/" tableofcontents = "https://parahumans.wordpress.com/table-of-contents/" ) type arc struct { identifier string title string chapters []chapter } type chapter struct { title string url string tags []string paragraphs []paragraph retries int dateposted string } type paragraph string // format paragraph func (p *paragraph) format() { s := string(*p) // handle emphasis s = strings.replace(s, "<em>", "*", -1) s = strings.replace(s, "</em>", "*", -1) s = strings.replace(s, "<i>", "*", -1) s = strings.replace(s, "</i>", "*", -1) // handle bold s = strings.replace(s, "<strong>", "**", -1) s = strings.replace(s, "</strong>", "**", -1) s = strings.replace(s, "<b>", "**", -1) s = strings.replace(s, "</b>", "**", -1) // remove new lines s = strings.replace(s, "\n", "", -1) // , random double spaces s = strings.replace(s, ". ", ". 
", -1) *p = paragraph(s) } // return arc given chapter belongs func (ch *chapter) whicharc(arclist []*arc) (*arc, error) { _, arc := range arclist { if strings.replace(ch.title[:2], ".", "", -1) == arc.identifier { return arc, nil } } return &arc{}, errors.new("chapter '" + ch.title + "' did not match arcs") } // parse chapter , return func (ch *chapter) parse(done chan bool) { if ch.retries > 3 { panic("chapter url '" + ch.url + "' has timed out many times") } // chapter if strings.hasprefix(ch.url, "http") == false { // make sure begins http goquery can use ch.url = "https://" + ch.url } doc, err := goquery.newdocument(ch.url) if err != nil { // try again ch.retries++ go ch.parse(done) return } // set new chapter title ch.title = doc.find("h1.entry-title").text() // set tags doc.find(".entry-meta a[rel=tag]").each(func(_ int, s *goquery.selection) { ch.tags = append(ch.tags, s.text()) if len(ch.tags) == 0 { ch.tags = append(ch.tags, "none") } }) // date posted ch.dateposted = doc.find("time.entry-date").text() // we'll paragraphs doc.find(".entry-content > p").each(func(_ int, s *goquery.selection) { // check previous/next links if len(s.find("a").nodes) > 0 { return } // paragraph html st, _ := s.html() para := paragraph("") // actual paragraph if val, exists := s.attr("padding-left"); exists && val == "30px" { // check see if paragraph special (indented) block para = paragraph(" " + st) } else if val, exists := s.attr("text-align"); exists && val == "center" { // otherwise check see if it's separator paragraph para = paragraph("----------") } else { // it's normal paragraph in case para = paragraph(st) } // , add paragraph chapter para.format() ch.paragraphs = append(ch.paragraphs, para) }) // finally, let's signal success done <- true } // return slice of arcs extracted table of contents func parsearcs(s string) []*arc { arcs := []*arc{} r, _ := regexp.compile(`[0-9]+`) _, line := range strings.split(s, "\n") { line = strings.trimspace(line) if 
strings.hasprefix(line, "arc") { arcs = append(arcs, &arc{ identifier: r.findstring(line), title: line, }) } else if strings.hasprefix(line, "epilogue") { arcs = append(arcs, &arc{ identifier: "e", title: line, }) } } return arcs } func main() { // define app app := cli.newapp() app.name = "worm scraper" app.usage = "a tool let updated epub copy of serial web novel worm, wildbow" app.version = "1.0" app.author = "benjamin harris" // define application flags app.flags = []cli.flag{ cli.boolflag{"pdf", "save book pdf instead of epub, if possible"}, cli.boolflag{"with-link", "include link chapter online"}, cli.boolflag{"with-tags", "include tags each chapter posted under"}, cli.boolflag{"with-date", "include date each chapter posted"}, } // heart of application app.action = func(context *cli.context) { // starting program fmt.println("starting scrape worm") // list of arcs table of contents fmt.println("gathering links table of contents...") contents, err := goquery.newdocument(tableofcontents) if err != nil { panic("failed table of contents! 
" + err.error()) } // parse arcs arcs := parsearcs(contents.find(".entry-content").text()) // links arc chapters contents.find(".entry-content a:not([class*=share-icon])").each(func(_ int, s *goquery.selection) { ch := chapter{} ch.title = strings.replace(strings.trimspace(s.text()), "\n", "", -1) ch.url, _ = s.attr("href") if ch.title == "" { return } arc, _ := ch.whicharc(arcs) arc.chapters = append(arc.chapters, ch) }) // manually add missing chapter in epilogue c := chapter{ title: "e.2", url: "https://parahumans.wordpress.com/2013/11/05/teneral-e-2/", } a, _ := c.whicharc(arcs) a.chapters = append(a.chapters, c) copy(a.chapters[1+1:], a.chapters[1:]) a.chapters[1] = c // start getting chapters chapters := 0 done := make(chan bool) _, arc := range arcs { i, _ := range arc.chapters { chapters++ go arc.chapters[i].parse(done) } } fmt.println("starting parse", chapters, "chapters") fmt.print("finished: ") totalchapters := chapters { select { case <-done: chapters-- fmt.print(totalchapters-chapters, ",") } if chapters == 0 { // we're done chapters close(done) fmt.println() break } } // , let's write stuff file fmt.println("saving results file...") f, err := os.openfile("worm.md", os.o_rdwr|os.o_create|os.o_excl, 0666) if err != nil { panic(err) } defer f.close() // define pagebreak pagebreak := "\n\n" // write cover f.writestring("# worm\n\n") f.writestring("by wildbow\n\n") f.writestring("website: " + mainsite) // loop through arcs _, arc := range arcs { f.writestring(pagebreak + "# " + arc.title) _, chapter := range arc.chapters { f.writestring("\n\n") f.writestring("## " + chapter.title + "\n\n") if context.bool("with-tags") { f.writestring("**tags:** " + strings.join(chapter.tags, ", ") + " ") } if context.bool("with-date") { f.writestring("**date:** " + chapter.dateposted + " ") } if context.bool("with-link") { f.writestring("**link:** " + chapter.url + " ") } f.writestring("\n\n") // save chapter's paragraphs _, p := range chapter.paragraphs { 
f.writestring(string(p) + "\n\n") } } } // let's try convert markdown file ebook format (epub, pdf) fmt.print("attempting convert markdown file... ") cmdtext := []string{"-s", "worm.md", "--epub-chapter-level", "2", "-o", "worm.epub"} if context.bool("pdf") { cmdtext = []string{"worm.md", "-o", "worm.pdf"} pagebreak = `<div style="page-break-after: always;"></div>` } cmd := exec.command("pandoc", cmdtext...) err = cmd.run() if err != nil { fmt.println("conversion failed! make sure you've installed pandoc (http://johnmacfarlane.net/pandoc/installing.html) if want convert generated markdown file ebook compatible format. in meantime, we've left markdown file.") } else { _ = os.remove("worm.md") fmt.println("completed!") } } // run application app.run(os.args) }
Oh, and is it possible to modify it to output .txt or .mobi? If not, I'll convert the result using Calibre. Thanks in advance. Oh, and if it matters, I'm using Windows 7 64-bit.
The Go compiler doesn't import libraries directly from the internet, but the go tool does know how to fetch them for you. When you import github.com/codegangsta/cli,
it doesn't look at that URL; instead, it looks in the GOPATH/src folder.
The `go get`
command can fetch a library from its URL and download it into your GOPATH.
If you have set up your GOPATH (if not, read "How to Write Go Code"), then before running your code, run the command `go get <library>` and
the go tool will download it for you. In your example, you should run the following commands:
go get github.com/codegangsta/cli
go get github.com/PuerkitoBio/goquery
That will download the libraries into GOPATH/src/github.com/codegangsta/cli
and GOPATH/src/github.com/PuerkitoBio/goquery,
respectively.
Comments
Post a Comment