# coding=utf-8
"""
Character-handling utilities.

* Classify a unicode character as a Chinese ideograph, an ASCII digit,
  an ASCII letter, or "other".
* Convert between full-width and half-width forms.
* Look up the first letter of a string (the pinyin initial for common
  Chinese characters).

The module runs unchanged on Python 2 and Python 3.
"""

__author__ = 'zhanghe'

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character

# Offset between a full-width ASCII variant and its half-width form
# (U+FF01..U+FF5E <-> U+0021..U+007E).
_FULL_WIDTH_OFFSET = 0xfee0
_FULL_WIDTH_SPACE = 0x3000
_HALF_WIDTH_SPACE = 0x0020
_HALF_WIDTH_MAX = 0x7e

# Pinyin-initial lookup for the GB2312 level-1 area (0xB0A1..0xD7F9).
# Keys are the two-byte GB code minus 65536 (the historical signed 16-bit
# form of this table).  Each entry is (inclusive upper bound, letter); the
# ranges are contiguous, so only the upper bounds need to be stored.
# There are no entries for I, U and V.
_PINYIN_FIRST_CODE = -20319
_PINYIN_LAST_CODE = -10247
_PINYIN_UPPER_BOUNDS = (
    (-20284, 'A'),
    (-19776, 'B'),
    (-19219, 'C'),
    (-18711, 'D'),
    (-18527, 'E'),
    (-18240, 'F'),
    (-17923, 'G'),
    (-17418, 'H'),
    (-16475, 'J'),
    (-16213, 'K'),
    (-15641, 'L'),
    (-15166, 'M'),
    (-14923, 'N'),
    (-14915, 'O'),
    (-14631, 'P'),
    (-14150, 'Q'),
    (-14091, 'R'),
    (-13319, 'S'),
    (-12839, 'T'),
    (-12557, 'W'),
    (-11848, 'X'),
    (-11056, 'Y'),
    (-10247, 'Z'),
)


def is_alphabet(uchar):
    """
    Return True if ``uchar`` is an ASCII letter (A-Z or a-z).
    """
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')


def is_chinese(uchar):
    """
    Return True if ``uchar`` lies in the range U+4E00..U+9FA5.

    Intended for a single character.  The test is a plain string
    comparison, so a longer string is compared lexicographically against
    the two bounds.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """
    Return True if ``uchar`` is an ASCII digit (0-9).
    """
    return u'\u0030' <= uchar <= u'\u0039'


def is_other(uchar):
    """
    Return True if ``uchar`` is neither Chinese, a digit, nor a letter.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))


def b2q(uchar):
    """
    Convert a half-width character to its full-width form.

    Characters outside U+0020..U+007E are returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code < _HALF_WIDTH_SPACE or inside_code > _HALF_WIDTH_MAX:
        # Not a half-width character: hand it back untouched.
        return uchar
    if inside_code == _HALF_WIDTH_SPACE:
        # The space is the one exception to the fixed-offset rule.
        inside_code = _FULL_WIDTH_SPACE
    else:
        inside_code += _FULL_WIDTH_OFFSET
    return _unichr(inside_code)


def q2b(uchar):
    """
    Convert a full-width character to its half-width form.

    If the converted code point is not in U+0020..U+007E the original
    character is returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code == _FULL_WIDTH_SPACE:
        inside_code = _HALF_WIDTH_SPACE
    else:
        inside_code -= _FULL_WIDTH_OFFSET
    if inside_code < _HALF_WIDTH_SPACE or inside_code > _HALF_WIDTH_MAX:
        # The result is not a half-width character, so the input was not
        # a full-width one: keep the original.
        return uchar
    return _unichr(inside_code)


def string_q2b(ustring):
    """
    Convert every full-width character in ``ustring`` to half-width.
    """
    return "".join([q2b(uchar) for uchar in ustring])


def uniform(ustring):
    """
    Normalise a string: full-width to half-width, then lower-case.
    """
    return string_q2b(ustring).lower()


def string2list(ustring):
    """
    Split ``ustring`` into runs of Chinese characters, letters and digits.

    Any "other" character (see ``is_other``) acts as a separator and is
    dropped; empty runs are never emitted.
    """
    ret_list = []
    u_tmp = []
    for uchar in ustring:
        if not is_other(uchar):
            u_tmp.append(uchar)
        elif u_tmp:
            ret_list.append("".join(u_tmp))
            u_tmp = []
    if u_tmp:
        ret_list.append("".join(u_tmp))
    return ret_list


def get_first_char(s):
    """
    Return the upper-case first letter of ``s``.

    * If the first character upper-cases to an ASCII letter, that letter
      is returned.
    * If the first character is a Chinese character from the GB2312
      level-1 area, the initial of its pinyin is returned.
    * Otherwise (``None``, empty input, digits, punctuation, characters
      outside the table) an empty string is returned.

    ``s`` may be text or UTF-8 encoded bytes; undecodable bytes are
    ignored.
    """
    if not s:
        return ''
    if isinstance(s, bytes):
        s = s.decode('utf-8', 'ignore')
        if not s:
            # Every byte was dropped by the 'ignore' error handler.
            return ''

    head = s[0]

    # Letter case: return it directly.  Upper-casing can yield more than
    # one character on Python 3 (u'\xdf' -> 'SS'), hence the length check.
    upper = head.upper()
    if len(upper) == 1 and u'A' <= upper <= u'Z':
        return upper

    # Chinese case: only the first character matters, so encode just it.
    gb = bytearray(head.encode('gb18030', 'ignore'))
    if len(gb) != 2:
        # ASCII (1 byte), a 4-byte GB18030 sequence, or unencodable.
        return ''
    lead, trail = gb
    if trail < 0xa1:
        # GBK extension cell: not ordered by pinyin, table does not apply.
        return ''

    asc = lead * 256 + trail - 65536
    if asc < _PINYIN_FIRST_CODE or asc > _PINYIN_LAST_CODE:
        return ''
    for upper_bound, letter in _PINYIN_UPPER_BOUNDS:
        if asc <= upper_bound:
            return letter
    return ''


if __name__ == '__main__':
    # "ni hao a" -> True
    print(is_chinese(u'\u4f60\u597d\u554a'))
    # "ni" + "abc" -> True
    print(is_chinese(u'\u4f60abc'))
    # "abc" + "ni" -> False
    print(is_chinese(u'abc\u4f60'))
    # A
    print(get_first_char(u'abc\u4f60'))
    # Q
    print(get_first_char('q'))
    # "ni hao" -> N
    print(get_first_char(u'\u4f60\u597d'))