# coding=utf-8
"""
Character-handling utilities.

* Classify a single unicode character as an English letter, a Chinese
  character, a digit, or "other".
* Convert between half-width (ASCII) and full-width symbol forms.
* Split a string into runs of letters/digits/Chinese characters.
* Get the leading Latin letter of a string (the pinyin initial when the
  string starts with a Chinese character).

The module runs on both Python 2 and Python 3.
"""

__author__ = 'zhanghe'

try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character

# GB2312 level-1 Chinese characters are ordered by pinyin.  Each entry is
# (last signed code of the group, initial letter); a "signed code" is
# lead_byte * 256 + trail_byte - 65536.  The groups are contiguous, starting
# at _PINYIN_FIRST_CODE.  There are no groups for I, U and V.
_PINYIN_FIRST_CODE = -20319
_PINYIN_UPPER_BOUNDS = (
    (-20284, 'A'), (-19776, 'B'), (-19219, 'C'), (-18711, 'D'),
    (-18527, 'E'), (-18240, 'F'), (-17923, 'G'), (-17418, 'H'),
    (-16475, 'J'), (-16213, 'K'), (-15641, 'L'), (-15166, 'M'),
    (-14923, 'N'), (-14915, 'O'), (-14631, 'P'), (-14150, 'Q'),
    (-14091, 'R'), (-13319, 'S'), (-12839, 'T'), (-12557, 'W'),
    (-11848, 'X'), (-11056, 'Y'), (-10247, 'Z'),
)
# Smallest trail byte of a GB2312 double-byte character.  Smaller trail bytes
# belong to GBK extensions / 4-byte GB18030 sequences, for which the pinyin
# table above is meaningless.
_GB2312_MIN_TRAIL_BYTE = 0xA1


def is_alphabet(uchar):
    """
    Return True if ``uchar`` is an ASCII English letter (A-Z or a-z).

    Intended for a single unicode character.
    """
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')


def is_chinese(uchar):
    """
    Return True if ``uchar`` lies in the range U+4E00..U+9FA5.

    Intended for a single unicode character.  A longer string is compared
    lexicographically, so the result is driven by its leading character(s).
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """
    Return True if ``uchar`` is an ASCII digit (0-9).

    Intended for a single unicode character.
    """
    return u'\u0030' <= uchar <= u'\u0039'


def is_other(uchar):
    """
    Return True if ``uchar`` is neither a Chinese character, an ASCII digit
    nor an ASCII English letter.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))


def b2q(uchar):
    """
    Convert a half-width character to its full-width form.

    Characters outside the printable ASCII range 0x20..0x7E are returned
    unchanged.
    """
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a half-width character: return it untouched.
        return uchar
    if inside_code == 0x0020:
        # The space is the one exception to the fixed-offset rule below.
        inside_code = 0x3000
    else:
        # For every other character: full-width = half-width + 0xFEE0.
        inside_code += 0xfee0
    return _unichr(inside_code)


def q2b(uchar):
    """
    Convert a full-width character to its half-width form.

    If the converted code does not land in the printable ASCII range
    0x20..0x7E, the original character is returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:
        # The result is not a half-width character: keep the original.
        return uchar
    return _unichr(inside_code)


def string_q2b(ustring):
    """
    Convert every full-width character of ``ustring`` to half-width.
    """
    return "".join([q2b(uchar) for uchar in ustring])


def uniform(ustring):
    """
    Normalise a string: full-width to half-width, then lower-case.
    """
    return string_q2b(ustring).lower()


def string2list(ustring):
    """
    Split ``ustring`` into runs of Chinese characters, letters and digits.

    Every character for which ``is_other`` is true acts as a separator and is
    dropped; empty runs are not emitted.
    """
    ret_list = []
    u_tmp = []
    for uchar in ustring:
        if not is_other(uchar):
            u_tmp.append(uchar)
        elif u_tmp:
            # A separator closes the run collected so far.
            ret_list.append("".join(u_tmp))
            u_tmp = []
    if u_tmp:
        ret_list.append("".join(u_tmp))
    return ret_list


def get_first_char(s):
    """
    Return the upper-case leading letter of ``s``.

    If the first character upper-cases to A-Z it is returned upper-cased.
    If it is a GB2312 level-1 Chinese character, the initial letter of its
    pinyin is returned.  In every other case (None, empty input, digits,
    punctuation, characters not covered by the pinyin table) the result is
    the empty string.

    ``s`` may be text or UTF-8 encoded bytes; undecodable bytes are ignored.
    """
    if not s:
        return ''
    if isinstance(s, bytes):
        # Only byte strings need decoding; text is used as is.
        s = s.decode('utf-8', 'ignore')
        if not s:
            # Nothing survived decoding.
            return ''

    # A leading letter is returned directly.
    first_upper = s[0].upper()
    # upper() may expand to several characters on Python 3; such characters
    # are never plain A-Z.
    if len(first_upper) == 1 and ord('A') <= ord(first_upper) <= ord('Z'):
        return first_upper

    # Chinese characters: look at the first two bytes of the GB18030 form.
    # bytearray yields integers on both Python 2 and Python 3.
    encoded = bytearray(s.encode('gb18030', 'ignore'))
    if len(encoded) < 2:
        return ''
    if encoded[1] < _GB2312_MIN_TRAIL_BYTE:
        # Not a GB2312 double-byte character, so the table does not apply.
        return ''
    asc = encoded[0] * 256 + encoded[1] - 65536
    if asc < _PINYIN_FIRST_CODE:
        return ''
    for upper_bound, letter in _PINYIN_UPPER_BOUNDS:
        if asc <= upper_bound:
            return letter
    return ''


if __name__ == '__main__':
    # Non-ASCII demo inputs are written as escapes: U+4F60 "ni", U+597D "hao".
    print(is_chinese(u'\u4f60\u597d\u5417'))  # True
    print(is_chinese(u'\u4f60abc'))  # True
    print(is_chinese(u'abc\u4f60'))  # False
    print(get_first_char(u'abc\u4f60'))  # A
    print(get_first_char('q'))  # Q
    print(get_first_char(u'\u4f60\u597d'))  # N