我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用chardet.detect()。
def force_unicode(s):
    """Do all kinds of magic to turn `s` into unicode.

    Tries UTF-8 first, then the locale's preferred encoding, then (if
    available) chardet's guess.  Returns the fallback error message when
    every attempt fails.
    """
    # Try some default encodings:
    try:
        return s.decode('utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return s.decode(locale.getpreferredencoding())
    except (UnicodeDecodeError, LookupError):
        # BUGFIX: the original `except: return _('Unicode conversion error')`
        # here swallowed every failure and made the chardet fallback below
        # unreachable; now we fall through to it instead.
        pass
    if chardet is not None:
        # Try chardet, if available.
        encoding = chardet.detect(s)['encoding']
        if encoding is not None:
            try:
                return s.decode(encoding)
            except UnicodeDecodeError:
                pass
    # Give up.  (The original ended with a bare `raise`, which outside an
    # except block raises RuntimeError; return the error message instead.)
    return _('Unicode conversion error')
def skipwrap(para):
    """Decide whether paragraph *para* must be exempt from line wrapping."""
    # Code blocks (four leading spaces or a tab) are never wrapped.
    if para[0:4] == '    ' or para[0] == '\t':
        return True
    stripped = para.lstrip()
    # A leading "--" that is not part of a longer dash run is an emdash;
    # emdash paragraphs wrap normally.
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but
    # there's a <br>-inside-<span> case in one of the tests that also
    # depends upon it.
    if stripped[:1] in ('-', '*'):
        return True
    # Ordered ("1. item") or unordered ("- item") list items keep their own
    # lines; don't wrap them.
    return bool(ordered_list_matcher.match(stripped)
                or unordered_list_matcher.match(stripped))
def __getPage(self, url):
    """Fetch *url* over certificate-verified HTTPS and return the body
    decoded with chardet's guessed encoding.

    Best-effort: any failure leaves the result at whatever stage was
    reached ('' when nothing was fetched, raw bytes when only the decode
    failed).
    """
    pool = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',  # Force certificate check.
        ca_certs=certifi.where(),   # Path to the Certifi bundle.
    )
    content = ''
    try:
        response = pool.request(
            'GET', url, timeout=10,
            headers={'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'},
        )
        content = response.data
        guess = chardet.detect(content)
        content = content.decode(guess['encoding'])
    except:
        pass
    return content
def debug_page():
    # Debug helper (Python 2 code): fetch a sample rental-listing page and
    # parse it into an lxml tree.  Returns (tree, text).
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    #r.encoding='gbk'
    print r.status_code
    print type(r.content)
    print r.content
    #print chardet.detect(r)
    # Force the HTML parser to treat the markup as UTF-8.
    tree = etree.HTML(r.text,parser=etree.HTMLParser(encoding='utf-8'))
    #print etree.tostring(tree)
    return tree,r.text  # NOTE(review): the original trailing comment was mojibake; presumably a note about request headers
def strtodecode(s=''):
    # Best-effort conversion of a byte string to unicode (Python 2 code).
    # Returns False for empty input; otherwise tries UTF-8, then GBK, then
    # chardet's guess, and returns the input unchanged if everything fails.
    if not s:
        return False
    tempstr = s
    try:
        chardetdict = chardet.detect(tempstr)
    except:
        pass
    else:
        try:
            # A lossless decode/encode round-trip identifies the true encoding.
            if tempstr.decode('utf-8','ignore').encode('utf-8') == tempstr:
                templabelstrdecode = tempstr.decode('utf-8','ignore').encode('utf-8').decode('utf-8')
            elif tempstr.decode('gbk','ignore').encode('gbk') == tempstr:
                templabelstrdecode = tempstr.decode('gbk','ignore').encode('utf-8').decode('utf-8')
            else:
                # Fall back to whatever chardet detected.
                templabelstrdecode = tempstr.decode(chardetdict['encoding'],'ignore').encode('utf-8').decode('utf-8')
        except:
            pass
        else:
            tempstr = templabelstrdecode
    return tempstr
def extract_meta(html):
    """Collect the Chinese-language ``<meta content="...">`` values of *html*.

    Returns a list of unicode strings, or False when none qualify.
    """
    # Decode up front when the page is detected as UTF-8.
    if chardet.detect(html)['encoding'] == 'utf-8':
        html = html.decode('utf-8')
    document = etree.HTML(html.lower())
    contents = document.xpath(u"//meta/@content")
    # Keep only the entries that contain Chinese text.
    collected = [utf8_transfer(entry).decode('utf-8')
                 for entry in contents if zh_check(entry) == True]
    return collected if collected else False
def validation(schema):
    """Function decorator: decode, JSON-parse and jsonschema-validate the
    request body before handing the prepared payload to the wrapped view."""
    def dec(func):
        def wrapper(self, *args, **kwargs):
            try:
                raw = request.data
                guessed = chardet.detect(raw)
                decoded = raw.decode(guessed['encoding'])
                payload = json.loads(decoded)
                jsonschema.validate(payload, schema)
                payload = prepare_input(payload)
            except:
                # Surface any parsing/validation problem as an HTTP 400.
                raise BadRequest('JSON input not valid: {}'.format(format_exc()))
            return func(self, payload, *args, **kwargs)
        return wrapper
    return dec
def autodecode(b):
    """
    Try to decode ``bytes`` to text - try default encoding first, otherwise
    try to autodetect

    Args:
        b (bytes): byte string

    Returns:
        str: decoded text string
    """
    try:
        return b.decode()
    except UnicodeError:
        # Lazy imports: only needed on the autodetection fallback path.
        import warnings
        import chardet
        result = chardet.detect(b)
        if result['confidence'] < 0.95:
            warnings.warn('autodecode failed with utf-8; guessing %s' % result['encoding'])
        # BUGFIX: the original called ``result.decode(...)`` on the chardet
        # result dict (always an AttributeError); decode the input bytes.
        return b.decode(result['encoding'])
def parsemail(raw_message):
    """Parse message headers, then remove BCC header."""
    message = email.parser.Parser().parsestr(raw_message)

    # Detect the text's encoding and record it on the message object.
    guess = chardet.detect(bytearray(raw_message, "utf-8"))
    charset = guess["encoding"]
    message.set_charset(charset)
    print(">>> encoding {}".format(charset))

    # Gather every recipient address from To, Cc and Bcc.
    recipients = []
    for field in ("TO", "CC", "BCC"):
        for _realname, address in email.utils.getaddresses(message.get_all(field, [])):
            recipients.append(address)

    # Bcc must never be transmitted with the message.
    del message["bcc"]
    text = message.as_string()
    sender = message["from"]
    return (text, sender, recipients)
def make_utf8(text, encoding):
    """Convert a text to UTF-8, brute-force (Python 2 code).

    Decodes *text* as UTF-8, then as *encoding*, then as ISO-8859-15 with
    errors ignored.  When chardet is installed and disagrees with the
    encoding that worked, chardet's guess wins.
    """
    try:
        u = unicode(text, 'utf-8')
        uenc = 'utf-8'
    except UnicodeError:
        try:
            u = unicode(text, encoding)
            # BUGFIX: record the encoding that actually worked.  The original
            # set uenc = 'utf-8' here, so the chardet comparison below could
            # never recognise this branch's real encoding.
            uenc = encoding
        except UnicodeError:
            u = unicode(text, 'iso-8859-15', 'ignore')
            uenc = 'iso-8859-15'
    try:
        import chardet
    except ImportError:
        return u.encode('utf-8')
    d = chardet.detect(text)
    if d['encoding'] == uenc:
        return u.encode('utf-8')
    return unicode(text, d['encoding'], 'ignore').encode('utf-8')
def caculation(self):
    # Refresh the portfolio DataFrame with today's quotes from tushare and
    # recompute per-holding profit, then export to Excel.
    # Python 2 / legacy pandas (.ix) code.
    # NOTE(review): the u'??'/u'????' column labels below were mojibake in
    # the source and are kept byte-identical; they are presumably Chinese
    # labels (code / date / change / price / profit) -- confirm against the
    # spreadsheet before editing.
    df_t=ts.get_today_all()
    print self.df[u'??'].values
    for i in self.df[u'??'].values:
        # Look up the holding's display name by its stock code.
        name=self.base[self.base['code']==i]['name'].values[0]
        print name
        t=name.decode('utf-8')
        print
        print type(t)
        #print chardet.detect(t)
        self.df.ix[self.df[u'??']==i,u'????']=self.today
        #t=ts.get_k_data(i)
        # Today's percentage change for this code.
        pchange=df_t.ix[df_t['code']==i,'changepercent'].values[0]
        print pchange
        self.df.ix[self.df[u'??']==i,u'????']=pchange
        # Current trade price.
        current=df_t[df_t['code']==i]['trade'].values[0]
        self.df.ix[self.df[u'??']==i,u'????']=current
        # Relative profit versus the recorded cost basis.
        current_profit=(current-self.df[self.df[u'??']==i][u'????'].values[0])/self.df[self.df[u'??']==i][u'????'].values[0]
        self.df.ix[self.df[u'??']==i,u'????']=round(current_profit*100,2)
        print current_profit
    print self.df
    self.df.to_excel(self.name,encoding='utf-8')
def read_and_train(root_dir, fileids, output_file='', options={}):
    # Train and save a model from plaintext corpus files (Python 2 code).
    # NOTE(review): the mutable default ``options={}`` is shared between
    # calls -- left as-is to preserve behaviour.
    fileids = fileids if isinstance(fileids, list) else [fileids]
    fileids = [unicode(f, 'utf8') for f in fileids]
    # Default output name is the joined file ids, plus the option string.
    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file, options_to_string(options))
    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)
    try:
        sents = reader.sents()
        print fileids
        train_and_save(sents, output_file, options)
    except UnicodeDecodeError:
        # Fallback: detect each file's encoding with chardet and retry.
        print "here"
        file_encodings = {}
        for fileid in fileids:
            file_content = open(root_dir + fileid).read()
            file_encoding = chardet.detect(file_content)
            file_encodings[fileid] = file_encoding['encoding']
        reader._encoding = file_encodings
        sents = reader.sents()
        train_and_save(sents, output_file, options)
def tryConvert(self):
    # Convert self.originalData to unicode (Python 2 code): try the object's
    # own detection first, fall back to chardet, then default to UTF-8.
    # Records the encoding used in self.originalEncoding; returns "" and
    # sets it to None on any error.
    try:
        encoding = self.detect(self.originalData)
        # No luck -- use the charset detector instead.
        if not encoding:
            encoding = chardet.detect(self.originalData)['encoding']
        if not encoding:
            print "Get encoding false, set utf-8 default"
            encoding = "utf-8"
        self.originalEncoding = encoding
        return self.toUnicode(self.originalData, encoding)
    except:
        #traceback.print_exc()
        self.originalEncoding = None
        return ""
def delspace(str):
    # Collapse whitespace runs in a byte string (Python 2 code): decode with
    # chardet's guess, delete any whitespace character that starts the
    # string or follows a space, then re-encode as UTF-8.
    # NOTE(review): shadows the builtin ``str``; kept to preserve behaviour.
    encoding = chardet.detect(str)
    if encoding['encoding']:
        str = unicode(str, encoding['encoding'])
    i = 0
    while True:
        if i >= len(str):
            break
        if str[i] == u' ' or str[i] == u'\n' or str[i] == u'\r' or str[i] == u' ' or str[i] == u'\t' or str[i] == u'\0':
            # Only delete when at the start or after a space, so a single
            # separating space survives; don't advance after a deletion.
            if i == 0 or str[i-1] == u' ':
                str = str[0:i] + str[i+1:]
                continue
        if i < len(str):
            i += 1
    str = str.encode('utf-8', 'ignore')
    return str
def docsplit(doc):
    """Split a document into sentence-like fragments (Python 2 code).

    Decodes *doc* (chardet guess, UTF-8 fallback), splits on '.' and the
    additional separator characters, and returns the pieces re-encoded as
    UTF-8 byte strings.
    """
    guess = chardet.detect(doc)
    if guess['encoding']:
        doc = unicode(doc, guess['encoding'])
    else:
        doc = unicode(doc, 'utf-8')

    fragments = doc.split(u'.')
    # Re-split every fragment on each further separator in turn.
    for separator in [u',', u';', u'?', u'?']:
        expanded = []
        for piece in fragments:
            expanded.extend(piece.split(separator))
        fragments = expanded

    return [piece.encode('utf-8', 'ignore') for piece in fragments]
def splitbykeyworks(str):
    # Split a byte string on each keyword from the module-level ``keykorks``
    # list (Python 2 code), returning UTF-8 encoded pieces.
    # NOTE(review): shadows the builtin ``str``; also, empty pieces produced
    # by a split are replaced by the keyword itself -- this looks intentional
    # but surprising; confirm the desired behaviour.
    encoding = chardet.detect(str)
    if encoding['encoding']:
        str = unicode(str, encoding['encoding'])
    else:
        str = unicode(str, 'utf-8')
    strlist = [str]
    for word in keykorks:
        words = []
        for str in strlist:
            r = str.split(word)
            for s in r:
                if s != u"":
                    words.append(s)
                else:
                    # An empty piece means the keyword bounded this spot;
                    # keep the keyword in its place.
                    words.append(word)
        strlist = words
    words = []
    for str in strlist:
        words.append(str.encode('utf-8', 'ignore'))
    return words
def splitbyclassifier(str):
    # Split a byte string at classifier characters (Python 2 code): when a
    # classifier char follows a digit run, the accumulated text is cut just
    # before the digits.  Uses module globals ``classifier`` and ``numlist``.
    # NOTE(review): pieces appended inside the loop stay unicode while the
    # final remainder is UTF-8 encoded -- confirm callers expect the mix.
    encoding = chardet.detect(str)
    if encoding['encoding']:
        str = unicode(str, encoding['encoding'])
    else:
        str = unicode(str, 'utf-8')
    words = []
    tmp = u""
    old = u""
    for ch in str:
        tmp += ch
        if ch in classifier:
            if old in numlist:
                # Walk backwards over the trailing digit run and cut in
                # front of it.
                for i in range(len(tmp)-2, 0, -1):
                    if tmp[i] not in numlist:
                        words.append(tmp[0:i+1])
                        words.append(tmp[i+1:])
                        tmp = ""
                        break
        old = ch
    if tmp != u"":
        words.append(tmp.encode('utf-8', 'ignore'))
    return words
def check(headers, html):
    """Guess a page's character set; returns it lowercased, '' when unknown.

    Precedence: Content-Type header, then a <meta> charset tag, then
    chardet run over growing prefixes of the document.
    """
    charset = ''
    # 1) Content-Type header.
    if headers and 'content-type' in headers:
        content_type = headers['content-type'].lower()
        pos = content_type.find('charset=')
        if pos != -1:
            charset = content_type[pos + len('charset='):].split(';')[0]
    # 2) <meta> declaration in the markup.
    if html and not charset:
        match = meta_re.search(html)
        if match:
            charset = match.group(1)
    # 3) chardet over progressively larger prefixes (cheap first).
    if html and not charset:
        lines = html.split('\n')
        for count in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:count]))['encoding']
            if charset and charset.lower() != 'ascii':
                break
    if charset == None:
        charset = ''
    return charset.lower()
def check(headers,html):
    """Guess the document charset, lowercased ('' when undetermined).

    Tries the Content-Type header first, then the <meta> tag, and finally
    chardet over 10/50/120-line prefixes of the markup.
    """
    charset = ''
    if headers and 'content-type' in headers:
        ctype = headers['content-type'].lower()
        idx = ctype.find('charset=')
        if idx != -1:
            # Take the value up to the next ';' parameter, if any.
            charset = ctype[idx+len('charset='):].split(';')[0]
    if html and not charset:
        meta_match = meta_re.search(html)
        if meta_match:
            charset = meta_match.group(1)
    if html and not charset:
        lines = html.split('\n')
        for prefix_len in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:prefix_len]))['encoding']
            # Accept anything more specific than plain ASCII.
            if charset and charset.lower() != 'ascii':
                break
    if charset == None:
        charset = ''
    return charset.lower()
def __url_title(url):
    # Fetch *url* and return its <title> text (Python 2 code); GBK pages
    # are decoded first so the title can be re-encoded as UTF-8.  Falls
    # back to returning the URL itself on any failure.
    title = url
    print url
    try:
        res = requests.get(url,timeout=1).content
        # chardet's result dict, stringified, e.g. "{'encoding': 'GB2312', ...}".
        char = str(chardet.detect(res))
        if re.search("encoding': 'GB.*",char):
            # GB* family (GBK/GB2312/GB18030): decode, then extract the title.
            res = unicode(res, 'gbk')
            res.encode('utf8')
            t = re.search('<title>(.*?)</title>',res)
            if t:
                title = t.group(1)
                title = title.encode('utf8')
            else:
                title = url
        else:
            t = re.search('<title>(.*?)</title>',res)
            if t:
                title = t.group(1)
            else:
                title = url
        return title
    except:
        #print traceback.format_exc()
        return title
def get_html(url):
    # Fetch *url* with urllib2 (Python 2 code), retrying up to 1000 times
    # until the body is detected as UTF-8 and no longer contains the
    # "please enable JavaScript" interstitial marker (the marker string
    # below was mojibake in the source and is kept byte-identical).
    html = ''
    for i in range(1000):
        if html and chardet.detect(html)['encoding'] == 'utf-8' and html.find('???JavaScript?????.') == -1:
            break
        try:
            # req_header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            #               'Accept':'text/html;q=0.9,*/*;q=0.8',
            #               'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            #               'Connection':'close',}
            # req_timeout = 10
            # req = urllib2.Request(url, None, req_header)
            # resp = urllib2.urlopen(req, timeout = 10)
            resp = urllib2.urlopen(url, timeout = 10)
            html = resp.read()
            resp.close()
        except socket.timeout, e:
            pass
        except Exception, e:
            # Best-effort: swallow everything and retry on the next pass.
            pass
    # print html[0:100]
    # print chardet.detect(html)
    return str(html)
def search_keyword_infile(file_path, word, case=True):
    """Search for *word* inside the file at *file_path*.

    Args:
        file_path: path to the file, e.g. './compare_key.py'.
        word: keyword to look for, e.g. "main".
        case: when True, *word* is lowercased before the search.

    Returns:
        whatever ``get_keyword`` returns for the decoded file contents.
    """
    if case is True:
        word = word.lower()
    with open(file_path, "rb") as handle:
        raw = handle.read()
    # chardet's guess is the last resort after the common encodings.
    detected = chardet.detect(raw)["encoding"]
    for candidate in ("utf8", "gbk"):
        try:
            text = raw.decode(candidate)
            break
        except UnicodeDecodeError:
            continue
    else:
        text = raw.decode(detected)
    return get_keyword(word, text, case)
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False):
    # UnicodeDammit-style converter (Python 2 code): turn *markup* into
    # unicode by trying, in order, the caller-supplied encodings, the
    # document's declared and sniffed encodings, chardet's guess, and
    # finally UTF-8 and windows-1252.  Result lands in self.unicode
    # (None on total failure).
    # NOTE(review): mutable default ``overrideEncodings=[]`` kept as-is.
    self.declaredHTMLEncoding = None
    self.markup, documentEncoding, sniffedEncoding = \
                 self._detectEncoding(markup, isHTML)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if markup == '' or isinstance(markup, unicode):
        # Already unicode (or empty): nothing to convert.
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return

    u = None
    for proposedEncoding in overrideEncodings:
        u = self._convertFrom(proposedEncoding)
        if u: break
    if not u:
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)
            if u: break

    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, unicode):
        u = self._convertFrom(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)
            if u: break

    self.unicode = u
    if not u: self.originalEncoding = None
def _ExtractFileEntries(
    self, file_system, file_entry, parent_full_path, depth):
    # Recursively walk *file_entry*, invoking self.callback for every data
    # stream of entries that self.accept approves, and tallying per-directory
    # entry counts for the first 5 levels.
    try:
        if isinstance(file_entry.name, six.text_type):
            name = file_entry.name
        else:
            # Byte-string name: decode it with chardet's guessed encoding.
            enc = chardet.detect(file_entry.name).get('encoding')
            name = file_entry.name.decode(enc)
        full_path = file_system.JoinPath([parent_full_path, name])
    except Exception:
        # NOTE(review): if the failure happened before ``name`` was bound,
        # this log call itself raises NameError -- worth confirming upstream.
        logging.exception("could not extract file entries: %s", name)
        return
    if depth < 5:
        # Dots are replaced (presumably to keep them out of the dotted meta
        # key namespace -- confirm) before counting this directory's entries.
        key = parent_full_path.replace('.', '\x00')
        if key not in self.parent.meta['directories']:
            self.parent.meta['directories'][key] = 0
        self.parent.meta['directories'][key] += 1
    if self.accept(full_path, depth):
        for data_stream in file_entry.data_streams:
            self.callback(file_entry, full_path, data_stream.name, self.parent)
    for sub_file_entry in file_entry.sub_file_entries:
        self._ExtractFileEntries(
            file_system, sub_file_entry, full_path, depth + 1)
def chardet_dammit(s):
    """Return cchardet's guessed encoding name for the byte string *s*."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return chardet's guessed encoding name for the byte string *s*."""
    guess = chardet.detect(s)
    return guess['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def smartCode(self, item):
    """Re-encode *item* as UTF-8 (Python 2 code), decoding first with
    chardet's guessed encoding and falling back to GBK on any failure."""
    guessed = chardet.detect(item)["encoding"]
    try:
        decoded = unicode(item, guessed)
        return decoded.encode("utf-8")
    except:
        # Detection was wrong (or None): assume GBK.
        decoded = unicode(item, 'gbk')
        return decoded.encode("utf-8")
def adjust_encoding(data):
    """detect and adjust encoding of data
    return data decoded to utf-8

    Returns:
        dict: {'encoding': chardet-detected encoding (may be None),
               'data': text decoded as UTF-8 (errors ignored on failure)}
    """
    check_encoding = chardet.detect(data)
    detected = check_encoding['encoding']
    # BUGFIX: chardet reports encoding=None for undecidable input; the
    # original ``'utf-8' not in check_encoding['encoding']`` then raised
    # TypeError.  Guard against None before the membership test.
    if detected and 'utf-8' not in detected:
        try:
            # Transcode from the detected encoding to UTF-8 bytes.
            data = data.decode(detected).encode('utf-8')
        except:
            pass
    try:
        data = data.decode('utf-8')
    except:
        data = data.decode('utf-8', 'ignore')
    return {'encoding': detected, 'data': data}
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml'):
    # Older UnicodeDammit-style constructor (Python 2 code): convert
    # *markup* to unicode via override encodings, document/sniffed
    # encodings, chardet, then UTF-8/windows-1252.  Result in self.unicode.
    # NOTE: mutable default ``overrideEncodings=[]`` kept for compatibility.
    self.markup, documentEncoding, sniffedEncoding = \
                 self._detectEncoding(markup)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if isinstance(markup, unicode):
        # BUGFIX: the original did ``return markup`` -- returning a non-None
        # value from __init__ raises TypeError.  Store the already-unicode
        # input on the instance and return normally instead (mirrors the
        # newer variant of this constructor).
        self.originalEncoding = None
        self.unicode = markup
        return

    u = None
    for proposedEncoding in overrideEncodings:
        u = self._convertFrom(proposedEncoding)
        if u: break
    if not u:
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)
            if u: break

    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, unicode):
        u = self._convertFrom(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)
            if u: break

    self.unicode = u
    if not u: self.originalEncoding = None
def dataReceived(self, data):
    # Feed *data* one character at a time through the state machine built by
    # self._build_state_table(), tracking line/column positions.  The input
    # encoding is detected lazily from the first chunk; instance state is
    # synced back in the finally block even if a handler raises.
    stateTable = self._build_state_table()
    self.encoding = self.encoding or detect(data)['encoding']
    self.check_encoding(data)
    self.state = self.state or 'begin'
    content = decode(data, self.encoding)
    # bring state, lineno, colno into local scope
    lineno, colno = self.lineno, self.colno
    curState = self.state
    # replace saveMark with a nested scope function (captures the local
    # counters instead of the instance attributes during this call)
    saveMark = lambda: (lineno, colno)
    self.saveMark, _saveMark = saveMark, self.saveMark
    # fetch functions from the stateTable
    beginFn, doFn, endFn = stateTable[curState]
    try:
        for char in content:
            # do newline stuff
            if char == '\n':
                lineno += 1
                colno = 0
            else:
                colno += 1
            newState = doFn(char)
            if newState and newState != curState:
                # this is the endFn from the previous state
                endFn()
                curState = newState
                beginFn, doFn, endFn = stateTable[curState]
                beginFn(char)
    finally:
        self.saveMark = _saveMark
        self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState
def popContent(self, tag):
    """Pop *tag* from the element stack, reset the content-tracking
    counters, and return the popped value."""
    popped = self.pop(tag)
    self.incontent -= 1
    self.contentparams.clear()
    return popped

# a number of elements in a number of RSS variants are nominally plain
# text, but this is routinely ignored. This is an attempt to detect
# the most common cases. As false positives often result in silent
# data loss, this function errs on the conservative side.