// SPDX-License-Identifier: MIT OR Unlicense
package main
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"sync"
	"sync/atomic"

	"github.com/boyter/gocodewalker"
	"github.com/lithammer/fuzzysearch/fuzzy"
)
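
// dirFilePaths holds the locations to be searched; note that only the first
// entry is ever walked by walkFiles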
var dirFilePaths = []string{}
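
// searchToFileMatchesCache maps previous search terms to the file locations
// they matched; it is only touched by the disabled caching code below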
var searchToFileMatchesCache = map[string][]string{}
//var searchToFileMatchesCacheMutex = sync.Mutex{}
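
// FindFiles returns a channel of candidate files for the supplied query.
// The cache lookup below is currently disabled (see the TODO), so for now
// every call simply walks the filesystem again.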
func FindFiles(query string) chan *gocodewalker.File {
// TODO enable this again as it seems to have issues
//searchToFileMatchesCacheMutex.Lock()
//defer searchToFileMatchesCacheMutex.Unlock()
//
//// get the keys for the cache
//var keys []string
//for k := range searchToFileMatchesCache {
// keys = append(keys, k)
//}
//
//// clear from the cache anything longer than the search since it will not help us
//for _, k := range keys {
// if len(k) > len(query) || query[0] != k[0] { // if cached is longer OR the first char does not match...
// delete(searchToFileMatchesCache, k)
// }
//}
//
//// check if the files we expect are in the cache...
//files := make(chan *gocodewalker.File, 100000)
//for i := len(query); i > 0; i-- {
// r, ok := searchToFileMatchesCache[query[:i]]
// if ok {
// go func() {
// for _, f := range r {
// files <- &gocodewalker.File{Location: f}
// }
// close(files)
// }()
// return files
// }
//}
return walkFiles()
}
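
// walkFiles starts a gocodewalker over the first supplied directory and
// returns the channel it posts discovered files onto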
func walkFiles() chan *gocodewalker.File {
	// Run through every file the filewalker finds; the channel is closed by the walker when done
fileListQueue := make(chan *gocodewalker.File, 1000)
if FindRoot {
dirFilePaths[0] = gocodewalker.FindRepositoryRoot(dirFilePaths[0])
}
fileWalker := gocodewalker.NewFileWalker(dirFilePaths[0], fileListQueue)
fileWalker.AllowListExtensions = AllowListExtensions
fileWalker.IgnoreIgnoreFile = IgnoreIgnoreFile
fileWalker.IgnoreGitIgnore = IgnoreGitIgnore
fileWalker.LocationExcludePattern = LocationExcludePattern
go func() { _ = fileWalker.Start() }()
return fileListQueue
}

// readFileContent reads the supplied file into memory, but only up to
// MaxReadSizeBytes of it
func readFileContent(f *gocodewalker.File) []byte {
fi, err := os.Lstat(f.Location)
if err != nil {
return nil
}
var content []byte
	// Only read the file in full if it is under the size cap, because anything beyond that is probably pointless
if fi.Size() < MaxReadSizeBytes {
var err error
content, err = os.ReadFile(f.Location)
if err != nil {
return nil
}
	} else {
		fil, err := os.Open(f.Location)
		if err != nil {
			return nil
		}
		defer fil.Close()

		// For oversized files read just the first MaxReadSizeBytes. A single
		// Read call is not guaranteed to fill the buffer, so use io.ReadFull
		// and keep only the bytes that were actually read.
		byteSlice := make([]byte, MaxReadSizeBytes)
		n, err := io.ReadFull(fil, byteSlice)
		if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) {
			return nil
		}
		content = byteSlice[:n]
	}
return content
}

// processFile reads the contents of the given file into memory and determines
// whether we should process it, based on checks such as whether it is binary
// or minified
func processFile(f *gocodewalker.File) ([]byte, error) {
content := readFileContent(f)
if len(content) == 0 {
if Verbose {
fmt.Printf("empty file so moving on %s\n", f.Location)
}
return nil, errors.New("empty file so moving on")
}
	// Check if this file is binary by looking for a NUL byte; if one is found bail out.
	// This is how GNU Grep, git and ripgrep check for binary files
if !IncludeBinaryFiles {
isBinary := false
binaryCheck := content
if len(binaryCheck) > 10_000 {
binaryCheck = content[:10_000]
}
if bytes.IndexByte(binaryCheck, 0) != -1 {
isBinary = true
}
if isBinary {
if Verbose {
fmt.Printf("file determined to be binary so moving on %s\n", f.Location)
}
return nil, errors.New("binary file")
}
}
if !IncludeMinified {
		// Check if the file is minified and if so ignore it
split := bytes.Split(content, []byte("\n"))
sumLineLength := 0
for _, s := range split {
sumLineLength += len(s)
}
averageLineLength := sumLineLength / len(split)
if averageLineLength > MinifiedLineByteLength {
if Verbose {
fmt.Printf("file determined to be minified so moving on %s", f.Location)
}
return nil, errors.New("file determined to be minified")
}
}
return content, nil
}
// FileReaderWorker reads files from disk in parallel
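//
// A minimal usage sketch (assuming dirFilePaths has already been populated,
// as it is elsewhere in this program, before FindFiles is called):
//
//	files := FindFiles("")
//	jobs := make(chan *FileJob, 100)
//	worker := NewFileReaderWorker(files, jobs)
//	go worker.Start()
//	for job := range jobs {
//		_ = job // search or rank job.Content here
//	}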
type FileReaderWorker struct {
input chan *gocodewalker.File
output chan *FileJob
fileCount int64 // Count of the number of files that have been read
InstanceId int
MaxReadSizeBytes int64
FuzzyMatch string
}
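
// NewFileReaderWorker returns a FileReaderWorker wired to the supplied input
// and output channels, defaulting MaxReadSizeBytes to 1 MB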
func NewFileReaderWorker(input chan *gocodewalker.File, output chan *FileJob) *FileReaderWorker {
return &FileReaderWorker{
input: input,
output: output,
fileCount: 0,
MaxReadSizeBytes: 1_000_000, // sensible default of 1MB
}
}
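
// GetFileCount reports how many files have been read so far; the atomic load
// makes it safe to call while Start is still running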
func (f *FileReaderWorker) GetFileCount() int64 {
return atomic.LoadInt64(&f.fileCount)
}

// Start is responsible for spinning up jobs that read files from disk into
// memory; it blocks until the input channel is closed and then closes the output channel
func (f *FileReaderWorker) Start() {
var wg sync.WaitGroup
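	// run one reader goroutine per CPU, with a floor of two, all draining the
	// shared input channel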
for i := 0; i < maxInt(2, runtime.NumCPU()); i++ {
wg.Add(1)
go func() {
for res := range f.input {
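				// when a fuzzy file name filter is set, skip files whose
				// names do not match it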
if f.FuzzyMatch != "" {
if !fuzzy.MatchFold(f.FuzzyMatch, res.Filename) {
continue
}
}
fil, err := processFile(res)
if err == nil {
atomic.AddInt64(&f.fileCount, 1)
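					// emit the job downstream; Score and MatchLocations
					// start zeroed for later stages to fill in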
f.output <- &FileJob{
Filename: res.Filename,
Extension: "",
Location: res.Location,
Content: fil,
Bytes: len(fil),
Score: 0,
MatchLocations: map[string][][]int{},
}
}
}
wg.Done()
}()
}
wg.Wait()
close(f.output)
}