Skip to content

Commit 1ac2870

Browse files
authored
Add files via upload
1 parent 7684c46 commit 1ac2870

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+38049
-0
lines changed

venv/Lib/site-packages/pip/_vendor/chardet/big5freq.py

Lines changed: 386 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
######################## BEGIN LICENSE BLOCK ########################
2+
# The Original Code is Mozilla Communicator client code.
3+
#
4+
# The Initial Developer of the Original Code is
5+
# Netscape Communications Corporation.
6+
# Portions created by the Initial Developer are Copyright (C) 1998
7+
# the Initial Developer. All Rights Reserved.
8+
#
9+
# Contributor(s):
10+
# Mark Pilgrim - port to Python
11+
#
12+
# This library is free software; you can redistribute it and/or
13+
# modify it under the terms of the GNU Lesser General Public
14+
# License as published by the Free Software Foundation; either
15+
# version 2.1 of the License, or (at your option) any later version.
16+
#
17+
# This library is distributed in the hope that it will be useful,
18+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20+
# Lesser General Public License for more details.
21+
#
22+
# You should have received a copy of the GNU Lesser General Public
23+
# License along with this library; if not, write to the Free Software
24+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25+
# 02110-1301 USA
26+
######################### END LICENSE BLOCK #########################
27+
28+
from .mbcharsetprober import MultiByteCharSetProber
29+
from .codingstatemachine import CodingStateMachine
30+
from .chardistribution import Big5DistributionAnalysis
31+
from .mbcssm import BIG5_SM_MODEL
32+
33+
34+
class Big5Prober(MultiByteCharSetProber):
    """Charset prober for the Big5 encoding.

    Wires the Big5 coding state machine model and the Big5 character
    distribution analyser into the multi-byte prober base class.
    """

    def __init__(self):
        super(Big5Prober, self).__init__()
        self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
        self.distribution_analyzer = Big5DistributionAnalysis()
        # Called only after both collaborators above are assigned.
        # NOTE(review): presumably the base-class reset() touches them;
        # the base class is not visible here -- confirm.
        self.reset()

    @property
    def charset_name(self):
        """Return the name of the charset this prober reports."""
        return "Big5"

    @property
    def language(self):
        """Return the language this prober reports."""
        return "Chinese"
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
######################## BEGIN LICENSE BLOCK ########################
2+
# The Original Code is Mozilla Communicator client code.
3+
#
4+
# The Initial Developer of the Original Code is
5+
# Netscape Communications Corporation.
6+
# Portions created by the Initial Developer are Copyright (C) 1998
7+
# the Initial Developer. All Rights Reserved.
8+
#
9+
# Contributor(s):
10+
# Mark Pilgrim - port to Python
11+
#
12+
# This library is free software; you can redistribute it and/or
13+
# modify it under the terms of the GNU Lesser General Public
14+
# License as published by the Free Software Foundation; either
15+
# version 2.1 of the License, or (at your option) any later version.
16+
#
17+
# This library is distributed in the hope that it will be useful,
18+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20+
# Lesser General Public License for more details.
21+
#
22+
# You should have received a copy of the GNU Lesser General Public
23+
# License along with this library; if not, write to the Free Software
24+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25+
# 02110-1301 USA
26+
######################### END LICENSE BLOCK #########################
27+
28+
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
29+
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
30+
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
31+
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
32+
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
33+
GB2312_TYPICAL_DISTRIBUTION_RATIO)
34+
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
35+
BIG5_TYPICAL_DISTRIBUTION_RATIO)
36+
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
37+
JIS_TYPICAL_DISTRIBUTION_RATIO)
38+
39+
40+
class CharDistributionAnalysis(object):
    """Base class for character-distribution based confidence scoring.

    Counts the two-byte characters it is fed and how many of them are
    "frequent" (frequency order below 512), then derives a confidence
    value from the ratio of the two counts.

    Subclasses provide the frequency table, its size and the typical
    distribution ratio, and override ``get_order`` to map an encoded
    character to an index into the frequency table.
    """

    ENOUGH_DATA_THRESHOLD = 1024
    SURE_YES = 0.99
    SURE_NO = 0.01
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self):
        # Mapping table to get frequency order from char order (get from
        # GetOrder())
        self._char_to_freq_order = None
        self._table_size = None  # Size of above table
        # This is a constant value which varies from language to language,
        # used in calculating confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = None
        self._done = None
        self._total_chars = None
        self._freq_chars = None
        self.reset()

    def reset(self):
        """Reset the analyser and clear any accumulated state."""
        # If this flag is set to True, detection is done and conclusion has
        # been made
        self._done = False
        self._total_chars = 0  # Total characters encountered
        # The number of characters whose frequency order is less than 512
        self._freq_chars = 0

    def feed(self, char, char_len):
        """Feed one character of known byte length into the analyser.

        :param char: the encoded character; it is passed unchanged to
            ``get_order``.
        :param char_len: length of the character in bytes.  Only
            two-byte characters take part in the analysis.
        """
        if char_len == 2:
            # we only care about 2-bytes character in our distribution
            # analysis
            order = self.get_order(char)
        else:
            order = -1
        if order >= 0:
            # A non-negative order counts towards the total even when it
            # falls outside the frequency table.
            self._total_chars += 1
            if order < self._table_size:
                if self._char_to_freq_order[order] < 512:
                    self._freq_chars += 1

    def get_confidence(self):
        """Return a confidence value based on the data seen so far.

        :returns: ``SURE_NO`` when no character was counted or when the
            number of frequent characters does not exceed
            ``MINIMUM_DATA_THRESHOLD``; otherwise the ratio described
            below, capped at ``SURE_YES``.
        """
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if (self._total_chars <= 0
                or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD):
            return self.SURE_NO

        # When every counted character is frequent the ratio below would
        # divide by zero, so that case goes straight to SURE_YES.
        if self._total_chars != self._freq_chars:
            r = (self._freq_chars / ((self._total_chars - self._freq_chars)
                                     * self.typical_distribution_ratio))
            if r < self.SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return self.SURE_YES

    def got_enough_data(self):
        """Return True once more than ENOUGH_DATA_THRESHOLD chars were seen."""
        # It is not necessary to receive all data to draw conclusion.
        # For charset detection, certain amount of data is enough
        return self._total_chars > self.ENOUGH_DATA_THRESHOLD

    def get_order(self, byte_str):
        """Map an encoded character to its order; the base class returns -1.

        :param byte_str: the encoded character (unused here).
        :returns: ``-1``, which ``feed`` treats as "not counted".
        """
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1
111+
112+
113+
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the EUC-TW frequency table."""

    def __init__(self):
        super(EUCTWDistributionAnalysis, self).__init__()
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
        self._table_size = EUCTW_TABLE_SIZE
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of an EUC-TW character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``94 * (first - 0xC4) + second - 0xA1`` when the first
            byte is at least 0xC4, otherwise ``-1``.
        """
        # for euc-TW encoding, we are interested
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = byte_str[0]
        if first_char < 0xC4:
            return -1
        # 94 positions per lead byte (trail bytes 0xa1..0xfe).
        return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
130+
131+
132+
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the EUC-KR frequency table."""

    def __init__(self):
        super(EUCKRDistributionAnalysis, self).__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of an EUC-KR character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``94 * (first - 0xB0) + second - 0xA1`` when the first
            byte is at least 0xB0, otherwise ``-1``.
        """
        # for euc-KR encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = byte_str[0]
        if first_char < 0xB0:
            return -1
        # 94 positions per lead byte (trail bytes 0xa1..0xfe).
        return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
149+
150+
151+
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the GB2312 frequency table."""

    def __init__(self):
        super(GB2312DistributionAnalysis, self).__init__()
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
        self._table_size = GB2312_TABLE_SIZE
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of a GB2312 character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``94 * (first - 0xB0) + second - 0xA1`` when the first
            byte is at least 0xB0 and the second at least 0xA1,
            otherwise ``-1``.
        """
        # for GB2312 encoding, we are interested
        #  first  byte range: 0xb0 -- 0xfe
        #  second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0xB0) and (second_char >= 0xA1):
            # 94 positions per lead byte (trail bytes 0xa1..0xfe).
            return 94 * (first_char - 0xB0) + second_char - 0xA1
        return -1
168+
169+
170+
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the Big5 frequency table."""

    def __init__(self):
        super(Big5DistributionAnalysis, self).__init__()
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
        self._table_size = BIG5_TABLE_SIZE
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of a Big5 character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``-1`` when the first byte is below 0xA4; otherwise
            ``157 * (first - 0xA4)`` plus the position of the second
            byte within its row.
        """
        # for big5 encoding, we are interested
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = byte_str[0], byte_str[1]
        if first_char < 0xA4:
            return -1
        row_start = 157 * (first_char - 0xA4)
        if second_char >= 0xA1:
            # The low trail range 0x40..0x7e holds 63 positions, so the
            # high trail range starts at offset 63 within the row.
            return row_start + second_char - 0xA1 + 63
        return row_start + second_char - 0x40
190+
191+
192+
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis for Shift_JIS using the JIS table."""

    def __init__(self):
        super(SJISDistributionAnalysis, self).__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of a Shift_JIS character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``-1`` when the first byte is outside 0x81..0x9F and
            0xE0..0xEF, or when the second byte is above 0x7F; otherwise
            ``188 * row + second - 0x40``.
        """
        # for sjis encoding, we are interested
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xef
        #   second byte range: 0x40 -- 0x7e , 0x81 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
            # 0x81..0x9f covers 31 lead bytes, so this range continues
            # at row 31.
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        # NOTE(review): trail bytes above 0x7F are reported as -1 even
        # though the range comment above lists 0x81 -- 0xfe; behaviour is
        # kept exactly as it was -- confirm whether this is intended.
        if second_char > 0x7F:
            order = -1
        return order
215+
216+
217+
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis for EUC-JP using the JIS table."""

    def __init__(self):
        super(EUCJPDistributionAnalysis, self).__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the frequency-table index of an EUC-JP character.

        :param byte_str: indexable whose first two items are the integer
            byte values of the character.
        :returns: ``94 * (first - 0xA1) + second - 0xA1`` when the first
            byte is at least 0xA0, otherwise ``-1``.
        """
        # for euc-JP encoding, we are interested
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        char = byte_str[0]
        if char < 0xA0:
            return -1
        # The threshold is 0xA0 but the row offset is 0xA1, so a lead byte
        # of exactly 0xA0 gives a negative result for any trail byte up to
        # 0xfe.
        return 94 * (char - 0xA1) + byte_str[1] - 0xA1
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
######################## BEGIN LICENSE BLOCK ########################
2+
# The Original Code is Mozilla Communicator client code.
3+
#
4+
# The Initial Developer of the Original Code is
5+
# Netscape Communications Corporation.
6+
# Portions created by the Initial Developer are Copyright (C) 1998
7+
# the Initial Developer. All Rights Reserved.
8+
#
9+
# Contributor(s):
10+
# Mark Pilgrim - port to Python
11+
#
12+
# This library is free software; you can redistribute it and/or
13+
# modify it under the terms of the GNU Lesser General Public
14+
# License as published by the Free Software Foundation; either
15+
# version 2.1 of the License, or (at your option) any later version.
16+
#
17+
# This library is distributed in the hope that it will be useful,
18+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20+
# Lesser General Public License for more details.
21+
#
22+
# You should have received a copy of the GNU Lesser General Public
23+
# License along with this library; if not, write to the Free Software
24+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25+
# 02110-1301 USA
26+
######################### END LICENSE BLOCK #########################
27+
28+
from .enums import ProbingState
29+
from .charsetprober import CharSetProber
30+
31+
32+
class CharSetGroupProber(CharSetProber):
    """Prober that delegates to a list of child probers.

    Feeds the same data to every active child in ``self.probers`` and
    reports the charset and language of the child with the highest
    confidence.  ``self.probers`` starts empty; populating it is left to
    code outside this class.
    """

    def __init__(self, lang_filter=None):
        super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
        self._active_num = 0  # number of children currently active
        self.probers = []
        self._best_guess_prober = None

    def reset(self):
        """Reset this prober and re-activate every child prober."""
        super(CharSetGroupProber, self).reset()
        self._active_num = 0
        for prober in self.probers:
            # Falsy entries in the list are skipped everywhere.
            if prober:
                prober.reset()
                prober.active = True
                self._active_num += 1
        self._best_guess_prober = None

    @property
    def charset_name(self):
        """Charset name of the best-guess child, or None if there is none."""
        if not self._best_guess_prober:
            # get_confidence() selects the best-guess child as a side
            # effect.
            self.get_confidence()
            if not self._best_guess_prober:
                return None
        return self._best_guess_prober.charset_name

    @property
    def language(self):
        """Language of the best-guess child, or None if there is none."""
        if not self._best_guess_prober:
            # get_confidence() selects the best-guess child as a side
            # effect.
            self.get_confidence()
            if not self._best_guess_prober:
                return None
        return self._best_guess_prober.language

    def feed(self, byte_str):
        """Feed data to every active child prober.

        :param byte_str: data passed unchanged to each child's ``feed``.
        :returns: ``self.state`` after the children have been fed.
        """
        for prober in self.probers:
            if not prober:
                continue
            if not prober.active:
                continue
            state = prober.feed(byte_str)
            # NOTE(review): a falsy state is presumably "still detecting";
            # the ProbingState enum is not visible here -- confirm.
            if not state:
                continue
            if state == ProbingState.FOUND_IT:
                # One positive child settles the whole group.
                self._best_guess_prober = prober
                self._state = ProbingState.FOUND_IT
                return self.state
            elif state == ProbingState.NOT_ME:
                prober.active = False
                self._active_num -= 1
                # The group gives up only once every child has.
                if self._active_num <= 0:
                    self._state = ProbingState.NOT_ME
                    return self.state
        return self.state

    def get_confidence(self):
        """Return the highest confidence among the active child probers.

        Also records the child that produced it as the best guess.

        :returns: 0.99 when the group state is FOUND_IT, 0.01 when it is
            NOT_ME, 0.0 when no active child reports a confidence above
            zero, otherwise the best child confidence.
        """
        state = self.state
        if state == ProbingState.FOUND_IT:
            return 0.99
        elif state == ProbingState.NOT_ME:
            return 0.01
        best_conf = 0.0
        self._best_guess_prober = None
        for prober in self.probers:
            if not prober:
                continue
            if not prober.active:
                self.logger.debug('%s not active', prober.charset_name)
                continue
            conf = prober.get_confidence()
            self.logger.debug('%s %s confidence = %s',
                              prober.charset_name, prober.language, conf)
            # Strict comparison: on a tie the earlier child is kept.
            if best_conf < conf:
                best_conf = conf
                self._best_guess_prober = prober
        if not self._best_guess_prober:
            return 0.0
        return best_conf

0 commit comments

Comments
 (0)