-
Notifications
You must be signed in to change notification settings - Fork 27
/
words.go
86 lines (75 loc) · 1.97 KB
/
words.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package strutil
import "unicode"
// CountWords returns the number of words in str. It shares its scanning logic
// with Words but asks for the count only, so no slice of words is allocated.
func CountWords(str string) int {
	_, n := words(str, true)
	return n
}
// Words splits str into its words and returns them in order of appearance.
//   - Digits count as word characters, so numbers are words too.
//   - The punctuation runes ', - and _ do not split a word when they occur
//     inside it.
func Words(str string) []string {
	list, _ := words(str, false)
	return list
}
// Rune categories assigned by words while it scans a string.
const (
	// wordRune is a rune that can start and continue a word (letter or digit).
	wordRune = iota
	// wordPuncRune is a rune that may continue a word but cannot start one,
	// such as the punctuation listed in wordPuncRunes.
	wordPuncRune
	// nonWordRune is any other rune; it terminates the current word.
	nonWordRune
)
// wordPuncRunes lists the punctuation runes that may appear inside a word
// without splitting it, e.g. O'Neil, micro-service, snake_case.
var wordPuncRunes = [...]rune{'\'', '-', '_'}

// inWordPuncRune reports whether r is one of the runes in wordPuncRunes.
func inWordPuncRune(r rune) bool {
	for i := 0; i < len(wordPuncRunes); i++ {
		if wordPuncRunes[i] == r {
			return true
		}
	}
	return false
}
// words is the ugly base function for Word and CountWords. It returns words
// and the count of the words. If onlyCount is true only count is returned,
// no array is created.
func words(str string, onlyCount bool) ([]string, int) {
var arr []string
if !onlyCount {
arr = make([]string, 0, len(str)/4) //TODO search for better start size
}
var prevCat = nonWordRune
var lastStart = -1
var count = 0
for i, r := range str {
var cat int
switch {
case unicode.IsLetter(r) || unicode.IsDigit(r):
cat = wordRune
case inWordPuncRune(r):
//faster: case r == wordPuncRunes[0] || r == wordPuncRunes[1] || r == wordPuncRunes[2]:
cat = wordPuncRune
default:
cat = nonWordRune
}
switch {
//start word
case cat == wordRune && prevCat != wordRune && lastStart == -1:
lastStart = i
//end word
case cat == nonWordRune && (prevCat == wordRune || prevCat == wordPuncRune) && lastStart >= 0:
if !onlyCount {
arr = append(arr, str[lastStart:i])
}
lastStart = -1
count++
}
prevCat = cat
}
if lastStart >= 0 {
if !onlyCount {
arr = append(arr, str[lastStart:])
}
count++
}
return arr, count
}