
form parsing and various concurrency fixes
yukinying committed Oct 8, 2015
1 parent c6de829 commit 44e7059
Showing 11 changed files with 450 additions and 142 deletions.
13 changes: 7 additions & 6 deletions cmd/gryffin-distributed/main.go
@@ -19,7 +19,6 @@ import (
"github.com/bitly/go-nsq"

"github.com/yahoo/gryffin"
"github.com/yahoo/gryffin/data"
"github.com/yahoo/gryffin/fuzzer/arachni"
"github.com/yahoo/gryffin/fuzzer/sqlmap"
"github.com/yahoo/gryffin/renderer"
@@ -107,7 +106,7 @@ func crawl() {
var consumer *nsq.Consumer

handler := nsq.HandlerFunc(func(m *nsq.Message) error {
scan := gryffin.NewScanFromJson(m.Body, t)
scan := gryffin.NewScanFromJson(m.Body)

if delay := scan.RateLimit(); delay != 0 {
go func() {
@@ -139,7 +138,7 @@ func crawl() {
// Therefore we don't need to test whether the link is coming
// from a duplicated page or not
for s := range r.GetLinks() {
if ok := s.ApplyLinkRules(); ok {
if ok := s.ShouldCrawl(); ok {
err := producer.Publish("seed", s.Json())
if err != nil {
fmt.Println("Could not publish", "seed", err)
@@ -166,7 +165,7 @@ func fuzzWithSqlmap() {
var consumer *nsq.Consumer
handler := nsq.HandlerFunc(func(m *nsq.Message) error {
wq <- true
scan := gryffin.NewScanFromJson(m.Body, t)
scan := gryffin.NewScanFromJson(m.Body)
f := &sqlmap.Fuzzer{}
f.Fuzz(scan)
<-wq
@@ -181,7 +180,7 @@ func fuzzWithArachni() {
var consumer *nsq.Consumer
handler := nsq.HandlerFunc(func(m *nsq.Message) error {
wq <- true
scan := gryffin.NewScanFromJson(m.Body, t)
scan := gryffin.NewScanFromJson(m.Body)
f := &arachni.Fuzzer{}
f.Fuzz(scan)
<-wq
@@ -224,11 +223,13 @@ func main() {
logWriter = io.MultiWriter(os.Stdout, tcpout)
}

gryffin.SetLogWriter(logWriter)

// we use a buffered channel to block when max concurrency is reached.
maxconcurrency := 5
wq = make(chan bool, maxconcurrency)

t = gryffin.NewScan("GET", url, "", data.NewMemoryStore(), logWriter)
t = gryffin.NewScan("GET", url, "")

// seed is a unique case where we exit the program immediately
if service == "seed" {
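
Taken together, the changes to the distributed driver replace the per-scan data.Store and io.Writer plumbing with package-level setup: the log writer is registered once via gryffin.SetLogWriter, and both NewScan and NewScanFromJson lose their extra arguments. A minimal sketch of the new call pattern (the URL is a placeholder; flag parsing and NSQ wiring are omitted):

    package main

    import (
        "os"

        "github.com/yahoo/gryffin"
    )

    func main() {
        // Register the process-wide log writer once, instead of passing an
        // io.Writer into every Scan.
        gryffin.SetLogWriter(os.Stdout)

        // NewScan no longer takes a data.Store or io.Writer.
        t := gryffin.NewScan("GET", "http://example.com/", "")

        // Queue workers rebuild a Scan from its JSON form; the second
        // (base *Scan) argument is gone.
        scan := gryffin.NewScanFromJson(t.Json())
        scan.Logm("Example", "round-tripped scan")
    }
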
22 changes: 13 additions & 9 deletions cmd/gryffin-standalone/main.go
@@ -15,7 +15,6 @@ import (
"time"

"github.com/yahoo/gryffin"
"github.com/yahoo/gryffin/data"
"github.com/yahoo/gryffin/fuzzer/arachni"
"github.com/yahoo/gryffin/fuzzer/sqlmap"
"github.com/yahoo/gryffin/renderer"
@@ -53,16 +52,14 @@ func linkChannels(s *gryffin.Scan) {
go func() {

for scan := range chanCrawl {
// scan := <-chanCrawl
r := &renderer.PhantomJSRenderer{Timeout: 10}
// r := &renderer.NoScriptRenderer{}
scan.CrawlAsync(r)

go func() {
if s := <-r.GetRequestBody(); s != nil {
// add two workers (two fuzzers)
wg.Add(2)
chanFuzz <- s
} else {
wg.Done()
}

}()
@@ -74,12 +71,14 @@ func linkChannels(s *gryffin.Scan) {
// Therefore we don't need to test whether the link is coming
// from a duplicated page or not
for newScan := range r.GetLinks() {
if ok := newScan.ApplyLinkRules(); ok {
if ok := newScan.ShouldCrawl(); ok {
// add one worker (a new crawl)
wg.Add(1)
chanRateLimit <- newScan
}
}

// remove one worker (finish crawl)
wg.Done()
scan.Logm("Get Links", "Finished")

}()
@@ -91,15 +90,16 @@ func linkChannels(s *gryffin.Scan) {
go func() {
for scan := range chanFuzz {

wg.Add(1) // we got two fuzzers, so add one more worker to the worker group.
go func() {
f := &arachni.Fuzzer{}
f.Fuzz(scan)
// remove a fuzzer worker.
wg.Done()
}()
go func() {
f := &sqlmap.Fuzzer{}
f.Fuzz(scan)
// remove a fuzzer worker.
wg.Done()
}()
}
@@ -140,6 +140,8 @@ func linkChannels(s *gryffin.Scan) {

chanStart <- s
close(chanStart)

// add one worker (start crawl)
wg.Add(1)
wg.Wait()
}
@@ -170,7 +172,9 @@ func main() {
w = io.MultiWriter(os.Stdout, tcpout)
}

scan := gryffin.NewScan(*method, url, *body, data.NewMemoryStore(), w)
gryffin.SetLogWriter(w)

scan := gryffin.NewScan(*method, url, *body)
scan.Logm("Main", "Started")

linkChannels(scan)
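
The worker-accounting changes above follow the standard sync.WaitGroup discipline: wg.Add must happen before the work item is handed off (here, before the channel send), and each item gets exactly one matching wg.Done. A stripped-down sketch of the pattern, independent of gryffin:

    package main

    import (
        "fmt"
        "sync"
    )

    func main() {
        var wg sync.WaitGroup
        work := make(chan int)

        // Consumer: one Done per item received.
        go func() {
            for n := range work {
                fmt.Println("processed", n)
                wg.Done()
            }
        }()

        // Producer: Add before the send, so Wait can never observe a zero
        // counter while an item is still in flight.
        for i := 0; i < 3; i++ {
            wg.Add(1)
            work <- i
        }

        wg.Wait()
        close(work)
    }
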
4 changes: 4 additions & 0 deletions fuzzer/arachni/arachni.go
@@ -53,6 +53,10 @@ func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) {

count = s.extract(g, string(output))

if err != nil {
return
}

g.Logm("Arachni.Scan", fmt.Sprintf("Arachni return %t", cmd.ProcessState.Success()))
return

4 changes: 4 additions & 0 deletions fuzzer/sqlmap/sqlmap.go
@@ -79,6 +79,10 @@ func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) {

output, err := cmd.Output()

if err != nil {
return
}

count = s.extract(g, string(output))

g.Logm("SQLMap.Scan", fmt.Sprintf("SQLMap return %t", cmd.ProcessState.Success()))
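
Both fuzzer wrappers (arachni above, sqlmap here) gain the same guard: if exec.Cmd's Output fails (binary missing from PATH, non-zero exit, and so on), the method returns before logging via cmd.ProcessState. A minimal reproduction of the shape of that guard, using a generic command rather than sqlmap or arachni:

    package main

    import (
        "fmt"
        "os/exec"
    )

    func run() (count int, err error) {
        output, err := exec.Command("echo", "hello").Output()
        if err != nil {
            // Bail out early; for some failures (e.g. binary not found)
            // there is no finished ProcessState to inspect.
            return
        }
        count = len(output)
        fmt.Println("output bytes:", count)
        return
    }

    func main() {
        if _, err := run(); err != nil {
            fmt.Println("fuzzer failed:", err)
        }
    }
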
21 changes: 21 additions & 0 deletions global.go
@@ -0,0 +1,21 @@
// Copyright 2015, Yahoo Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gryffin

import (
"io"
// "io/ioutil"
)

var memoryStore = NewGryffinStore(nil)
var logWriter io.Writer

func SetMemoryStore(m *GryffinStore) {
memoryStore = m
}

func SetLogWriter(w io.Writer) {
logWriter = w
}
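
global.go centralizes the two pieces of state that used to travel inside every Scan. A short usage sketch; the argument to NewGryffinStore is assumed here to be optional shared-store wiring, with nil meaning purely local, as the default above implies:

    package main

    import (
        "os"

        "github.com/yahoo/gryffin"
    )

    func main() {
        // Optionally swap the default in-memory store for one built with
        // custom wiring (nil keeps it local-only).
        gryffin.SetMemoryStore(gryffin.NewGryffinStore(nil))

        // With no writer set, logging is skipped; route it to stderr here.
        gryffin.SetLogWriter(os.Stderr)
    }
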
70 changes: 21 additions & 49 deletions gryffin.go
@@ -12,16 +12,13 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
"io"
"io/ioutil"
"net/http"
"net/http/cookiejar"
"net/url"
"strconv"
"strings"
"time"

"github.com/yahoo/gryffin/data"
"github.com/yahoo/gryffin/html-distance"
)

@@ -38,8 +35,6 @@ type Scan struct {
Cookies []*http.Cookie
Fingerprint Fingerprint
HitCount int
LogWriter io.Writer
Session data.Store
}

// Job stores the job id and config (if any).
@@ -84,10 +79,8 @@ type LogMessage struct {
Url string
}

var memoryStore = NewGryffinStore(nil, nil)

// NewScan creates a scan.
func NewScan(method, url, post string, session data.Store, writer io.Writer) *Scan {
func NewScan(method, url, post string) *Scan {

id := GenRandomID()

@@ -110,25 +103,12 @@ func NewScan(method, url, post string, session data.Store, writer io.Writer) *Scan {
Job: job,
Request: req,
RequestBody: post,
Session: session,
LogWriter: writer,
}
}

func NewScanFromJson(b []byte, base *Scan) *Scan {
var scan Scan
json.Unmarshal(b, &scan)
if base != nil {
scan.LogWriter = base.LogWriter
scan.Session = base.Session
}
// fmt.Printf("Cookie: %v\n", scan.Cookies)
return &scan
}

// getOrigin returns the Origin of the URL (scheme, hostname, port)
func getOrigin(u *url.URL) string {
return u.Scheme + u.Host
return u.Scheme + "://" + u.Host
}
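
The getOrigin fix is small but real: without the "://" separator, scheme and host concatenate into a string that is not an origin at all, so any origin comparison built on it would misbehave. For example:

    package main

    import (
        "fmt"
        "net/url"
    )

    func main() {
        u, _ := url.Parse("https://example.com/login")
        fmt.Println(u.Scheme + u.Host)         // "httpsexample.com" (old, wrong)
        fmt.Println(u.Scheme + "://" + u.Host) // "https://example.com" (fixed)
    }
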

// MergeRequest merges the request field in scan with the existing one.
@@ -143,8 +123,11 @@ func (s *Scan) MergeRequest(req *http.Request) {
// read the request body, and then reset the reader
var post []byte
if req.Body != nil {
if post, err := ioutil.ReadAll(req.Body); err != nil {
if post, err := ioutil.ReadAll(req.Body); err == nil {
req.Body = ioutil.NopCloser(bytes.NewReader(post))
} else {
// the only possible error is bytes.ErrTooLarge from the ioutil package.
s.Error("MergeRequest", err)
}
}
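
The corrected condition (err == nil, not err != nil) restores the standard read-and-restore idiom for request bodies: ReadAll drains the one-shot reader, and NopCloser wraps the captured bytes back into a ReadCloser so the body can be read again downstream. In isolation:

    package main

    import (
        "bytes"
        "fmt"
        "io/ioutil"
        "net/http"
        "strings"
    )

    func main() {
        req, _ := http.NewRequest("POST", "http://example.com/", strings.NewReader("a=1&b=2"))

        // Drain the body once...
        post, err := ioutil.ReadAll(req.Body)
        if err == nil {
            // ...then put an equivalent reader back so the request stays usable.
            req.Body = ioutil.NopCloser(bytes.NewReader(post))
        }

        again, _ := ioutil.ReadAll(req.Body)
        fmt.Println(bytes.Equal(post, again)) // true
    }
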

@@ -210,8 +193,6 @@ func (s *Scan) Spawn() *Scan {
Request: &req,
RequestBody: post,
Cookies: cookies,
Session: s.Session,
LogWriter: s.LogWriter,
}
}

@@ -337,7 +318,7 @@ func (s *Scan) IsScanAllowed() bool {
func (s *Scan) CrawlAsync(r Renderer) {
s.Logm("CrawlAsync", "Started")
if s.IsScanAllowed() {
go r.Do(s)
r.Do(s)
} else {
s.Logm("CrawlAsync", "Scan Not Allowed")
}
@@ -347,8 +328,8 @@ func (s *Scan) IsDuplicatedPage() bool {
func (s *Scan) IsDuplicatedPage() bool {
s.UpdateFingerprint()
f := s.Fingerprint.ResponseSimilarity
if !memoryStore.Seen(s, f, 2) {
memoryStore.See(s, f)
if !memoryStore.Seen(s.Job.ID, "oracle", f, 2) {
memoryStore.See(s.Job.ID, "oracle", f)
s.Logm("IsDuplicatedPage", "Unique Page")
return false
} else {
@@ -369,28 +350,19 @@ func (s *Scan) Fuzz(fuzzer Fuzzer) (int, error) {
// return
// }

// ApplyLinkRules checks if the links should be queued for next crawl.
func (s *Scan) ApplyLinkRules() bool {
// ShouldCrawl checks if the links should be queued for next crawl.
func (s *Scan) ShouldCrawl() bool {

s.UpdateFingerprint()
store := s.Session

// k := "hash/" + hex.EncodeToString(s.Fingerprint.URL)
k := "hash/url/" + strconv.FormatUint(s.Fingerprint.URL, 16)

// link seen before.
if _, ok := store.Get(k); ok {
// s.Logm("LinkRules", "Duplicated")
return false
f := s.Fingerprint.URL
if !memoryStore.Seen(s.Job.ID, "hash", f, 0) {
memoryStore.See(s.Job.ID, "hash", f)
s.Logm("ShouldCrawl", "Unique Link")
return true
} else {
s.Logm("ShouldCrawl", "Duplicate Link")
}
store.Set(k, true)
return true
return false
}

func (s *Scan) Json() []byte {
b, _ := json.Marshal(s)
return b
}

// TODO - LogFmt (fmt string)
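
Both dedup sites now go through the shared store with a (job ID, namespace, fingerprint, distance) shape: IsDuplicatedPage uses the "oracle" namespace with a fuzzy threshold of 2, while ShouldCrawl uses "hash" with 0 (exact match). A toy stand-in that illustrates the assumed semantics, where "seen" means some recorded fingerprint lies within the given number of differing bits:

    package main

    import (
        "fmt"
        "math/bits"
    )

    // store is a toy stand-in for GryffinStore, keyed by job ID and namespace.
    type store struct{ seen map[string][]uint64 }

    func (s *store) See(job, prefix string, f uint64) {
        k := job + "/" + prefix
        s.seen[k] = append(s.seen[k], f)
    }

    // Seen reports whether any recorded fingerprint is within radius bits of f:
    // radius 2 for the page "oracle", radius 0 (exact) for link hashes.
    func (s *store) Seen(job, prefix string, f uint64, radius int) bool {
        for _, g := range s.seen[job+"/"+prefix] {
            if bits.OnesCount64(f^g) <= radius {
                return true
            }
        }
        return false
    }

    func main() {
        st := &store{seen: make(map[string][]uint64)}
        st.See("job1", "oracle", 0x0A)
        fmt.Println(st.Seen("job1", "oracle", 0x08, 2)) // true: one bit apart
        fmt.Println(st.Seen("job1", "hash", 0x0A, 0))   // false: different namespace
    }
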
@@ -423,9 +395,9 @@ func (s *Scan) Logf(format string, a ...interface{}) {
}

func (s *Scan) Log(v interface{}) {
if s.LogWriter == nil {
if logWriter == nil {
return
}
encoder := json.NewEncoder(s.LogWriter)
encoder := json.NewEncoder(logWriter)
encoder.Encode(v)
}
