Python chardet 模块,detect() 实例源码

我们从 Python 开源项目中提取了以下 50 个代码示例,用于说明如何使用 chardet.detect()。

项目:BookCloud    作者:livro-aberto    | 项目源码 | 文件源码
def force_unicode(s):
    """Best-effort conversion of the byte string `s` to unicode.

    Decoding is attempted, in order, with UTF-8, with the locale's
    preferred encoding and, when the chardet module is available, with
    the encoding chardet guesses.

    Returns the decoded text, or the translated message
    'Unicode conversion error' when every attempt fails.
    """
    # Try some default encodings:
    try:
        return s.decode('utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return s.decode(locale.getpreferredencoding())
    except Exception:
        # Fall through to chardet instead of giving up immediately.
        pass
    if chardet is not None:
        # Try chardet, if available
        try:
            encoding = chardet.detect(s)['encoding']
            if encoding is not None:
                return s.decode(encoding)
        except Exception:
            pass
    # Give up.
    return (_('Unicode conversion error'))
项目:nstock    作者:ybenitezf    | 项目源码 | 文件源码
def skipwrap(para):
    """Tell whether the paragraph `para` must be left unwrapped.

    Returns True for code blocks and list items, False otherwise
    (including text that starts with an emdash written as "--").
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap.
    # Slices are used instead of para[0] so an empty paragraph cannot raise IndexError.
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] in ('-', '*'):
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
项目:gfw_domain_whitelist_spider    作者:R0uter    | 项目源码 | 文件源码
def __getPage(self,url):
        """Download `url` with certificate verification and return its body as text.

        Returns an empty string when the request fails.  The body is decoded
        with the encoding chardet detects; when detection yields nothing or
        the decode fails, UTF-8 with replacement characters is used so the
        caller always receives text rather than raw bytes.
        """
        http = urllib3.PoolManager(
            cert_reqs='CERT_REQUIRED',  # Force certificate check.
            ca_certs=certifi.where(),  # Path to the Certifi bundle.
        )
        try:
            raw = http.request('GET', url, timeout=10,
                               headers={
                                   'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}
                               ).data
        except Exception:
            # Best effort: a network or TLS failure yields an empty page.
            return ''

        # chardet reports None when it cannot make a guess.
        encoding = chardet.detect(raw)['encoding'] or 'utf-8'
        try:
            return raw.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            return raw.decode('utf-8', 'replace')
项目:base_function    作者:Rockyzsu    | 项目源码 | 文件源码
def debug_page():
    """Fetch a hard-coded qfang.com rental page, dump it for debugging and parse it.

    The status code, the type of the raw content and the raw content itself
    are printed.  Single-argument print(...) calls are used so the function
    behaves the same on Python 2 and parses on Python 3.

    Returns:
        tuple: (tree built by etree.HTML from the response text, response text).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    #r.encoding='gbk'
    print(r.status_code)
    print(type(r.content))
    print(r.content)
    tree = etree.HTML(r.text,parser=etree.HTMLParser(encoding='utf-8'))
    return tree,r.text

# ????????header??
项目:MyPythonLib    作者:BillWang139967    | 项目源码 | 文件源码
def strtodecode(s=''):
    """Decode the byte string `s` to text.

    UTF-8 is tried first, then GBK, then the encoding guessed by chardet.
    An encoding is accepted when decoding and re-encoding reproduces the
    input exactly.

    Returns False for empty input, the decoded text on success, and `s`
    unchanged when no decoding works.
    """
    if not s:
        return False

    try:
        detected = chardet.detect(s)['encoding']
    except Exception:
        # Detection is only the last resort; UTF-8 and GBK can still be tried.
        detected = None

    try:
        # A lossless round trip proves the bytes are valid in that encoding.
        if s.decode('utf-8', 'ignore').encode('utf-8') == s:
            return s.decode('utf-8')
        if s.decode('gbk', 'ignore').encode('gbk') == s:
            return s.decode('gbk')
        if detected:
            return s.decode(detected, 'ignore')
    except Exception:
        # Best effort: fall through and hand the input back untouched.
        pass

    return s
项目:HtmlExtract-Python    作者:xinyi-spark    | 项目源码 | 文件源码
def extract_meta(html):
    '''
    Collect the `content` attribute of every <meta> tag in `html` for which
    zh_check() returns True.

    The page is decoded as UTF-8 first when chardet reports 'utf-8', and is
    lower-cased before parsing.  Returns the list of values (each passed
    through utf8_transfer() and decoded as UTF-8), or False when none match.
    '''
    detected = chardet.detect(html)['encoding']
    if detected == 'utf-8':
        html = html.decode('utf-8')
    # Parse the lower-cased page and pull out the meta contents.
    tree = etree.HTML(html.lower())
    contents = tree.xpath(u"//meta/@content")
    meta_list = [
        utf8_transfer(content).decode('utf-8')
        for content in contents
        if zh_check(content) == True
    ]
    return meta_list if meta_list != [] else False
项目:cc-server    作者:curious-containers    | 项目源码 | 文件源码
def validation(schema):
    """Decorator factory that validates the JSON request body against `schema`.

    The wrapped method is called as func(self, json_input, *args, **kwargs),
    where json_input is the request body after decoding, JSON parsing,
    jsonschema validation and prepare_input().

    Raises:
        BadRequest: when the body cannot be decoded, parsed, validated or
            prepared; the message carries the formatted traceback.
    """
    import functools

    def dec(func):
        # wraps() keeps the decorated method's name and docstring intact.
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            try:
                rawdata = request.data
                enc = chardet.detect(rawdata)
                # chardet reports None when it cannot guess; fall back to UTF-8.
                data = rawdata.decode(enc['encoding'] or 'utf-8')
                json_input = json.loads(data)
                jsonschema.validate(json_input, schema)
                json_input = prepare_input(json_input)
            except Exception:
                raise BadRequest('JSON input not valid: {}'.format(format_exc()))
            return func(self, json_input, *args, **kwargs)
        return wrapper
    return dec
项目:py-cloud-compute-cannon    作者:Autodesk    | 项目源码 | 文件源码
def autodecode(b):
    """ Try to decode ``bytes`` to text - try default encoding first, otherwise try to autodetect

    Args:
        b (bytes): byte string

    Returns:
        str: decoded text string

    Raises:
        UnicodeError: if the default decoding fails and chardet cannot
            suggest an encoding (the original error is re-raised)
    """
    try:
        return b.decode()
    except UnicodeError:
        # Imported lazily so chardet is only required when the default fails.
        import warnings
        import chardet

        result = chardet.detect(b)
        encoding = result['encoding']
        if encoding is None:
            raise
        if result['confidence'] < 0.95:
            warnings.warn('autodecode failed with utf-8; guessing %s' % encoding)
        # Decode the input bytes; the detection result itself is a dict.
        return b.decode(encoding)
项目:mailmerge    作者:awdeorio    | 项目源码 | 文件源码
def parsemail(raw_message):
    """Parse a raw message, gather its recipients and drop the BCC header.

    Returns a tuple (message text without BCC, value of the From header,
    list of addresses taken from the To, Cc and Bcc headers).
    """
    parser = email.parser.Parser()
    message = parser.parsestr(raw_message)

    # Detect encoding
    raw_bytes = bytearray(raw_message, "utf-8")
    encoding = chardet.detect(raw_bytes)["encoding"]
    message.set_charset(encoding)
    print(">>> encoding {}".format(encoding))

    # Extract recipients
    addrs = []
    for header in ("TO", "CC", "BCC"):
        addrs = addrs + email.utils.getaddresses(message.get_all(header, []))
    recipients = [pair[1] for pair in addrs]

    # BCC must not be visible in the outgoing text.
    del message["bcc"]
    text = message.as_string()
    sender = message["from"]
    return (text, sender, recipients)
项目:yt    作者:yt-project    | 项目源码 | 文件源码
def make_utf8(text, encoding):
    """Convert a text to UTF-8, brute-force.

    `text` is decoded as UTF-8, then with `encoding`, then as iso-8859-15
    ignoring errors.  When chardet is importable and it detects a different
    encoding, the text is decoded again with that encoding (ignoring
    errors).  Returns the UTF-8 encoded result.
    """
    try:
        u = unicode(text, 'utf-8')
        uenc = 'utf-8'
    except UnicodeError:
        try:
            u = unicode(text, encoding)
            # NOTE(review): uenc stays 'utf-8' although `encoding` was used
            # for decoding - confirm this is intended.
            uenc = 'utf-8'
        except (UnicodeError, LookupError):
            # LookupError: `encoding` names a codec Python does not know.
            u = unicode(text, 'iso-8859-15', 'ignore')
            uenc = 'iso-8859-15'
    try:
        import chardet
    except ImportError:
        return u.encode('utf-8')
    detected = chardet.detect(text)['encoding']
    # chardet reports None when it has no guess; keep what we already decoded.
    if not detected or detected == uenc:
        return u.encode('utf-8')
    return unicode(text, detected, 'ignore').encode('utf-8')
项目:stock    作者:Rockyzsu    | 项目源码 | 文件源码
def caculation(self):
            """Refresh self.df with today's quote data and write it to an Excel file.

            For every value in one column of self.df, the matching row of
            ts.get_today_all() (matched on its 'code' column) supplies the
            'changepercent' and 'trade' values that are written back into
            self.df, together with self.today and a profit percentage.

            NOTE(review): the unicode column names appear as '??' / '????' in
            this copy of the source (non-ASCII text was lost), so which
            column each assignment targets cannot be told from here.
            """
            # Snapshot of today's quotes; `ts` is presumably tushare - confirm.
            df_t=ts.get_today_all()
            print self.df[u'??'].values
            for i in self.df[u'??'].values:
                # Look up the name for this code in self.base (debug output only).
                name=self.base[self.base['code']==i]['name'].values[0]
                print name
                t=name.decode('utf-8')
                print
                print type(t)
                #print chardet.detect(t)
                self.df.ix[self.df[u'??']==i,u'????']=self.today
                #t=ts.get_k_data(i)

                # Today's percentage change for this code.
                pchange=df_t.ix[df_t['code']==i,'changepercent'].values[0]
                print pchange
                self.df.ix[self.df[u'??']==i,u'????']=pchange
                # Current trade price for this code.
                current=df_t[df_t['code']==i]['trade'].values[0]
                self.df.ix[self.df[u'??']==i,u'????']=current
                # Relative difference between the current price and a stored value.
                current_profit=(current-self.df[self.df[u'??']==i][u'????'].values[0])/self.df[self.df[u'??']==i][u'????'].values[0]
                # Stored as a percentage rounded to two decimals.
                self.df.ix[self.df[u'??']==i,u'????']=round(current_profit*100,2)
                print current_profit
            print self.df
            self.df.to_excel(self.name,encoding='utf-8')
项目:w2vec-similarity    作者:jayantj    | 项目源码 | 文件源码
def read_and_train(root_dir, fileids, output_file='', options={}):
  """Read the given corpus files and train/save a model from their sentences.

  `fileids` may be a single id or a list of ids.  When `output_file` is
  empty the ids joined with '-' are used; the final path is built from
  MODELS_DIR, that name and options_to_string(options).

  If reading the corpus raises UnicodeDecodeError, the encoding of each
  file is detected with chardet, handed to the reader, and training is
  retried once.

  NOTE: `options` keeps its mutable default for interface compatibility;
  this function never mutates it.
  """
  fileids = fileids if isinstance(fileids, list) else [fileids]
  fileids = [unicode(f, 'utf8') for f in fileids]
  output_file = output_file or '-'.join(fileids)
  output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file, options_to_string(options))
  reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)
  try:
    sents = reader.sents()
    print(fileids)
    train_and_save(sents, output_file, options)
  except UnicodeDecodeError:
    print("here")
    file_encodings = {}
    for fileid in fileids:
      # Close each file promptly instead of leaking the handle.
      with open(root_dir + fileid) as handle:
        file_content = handle.read()
      file_encodings[fileid] = chardet.detect(file_content)['encoding']
    reader._encoding = file_encodings
    sents = reader.sents()
    train_and_save(sents, output_file, options)
项目:sogaQuant    作者:idoplay    | 项目源码 | 文件源码
def tryConvert(self):
        """Convert self.originalData to unicode and record the encoding used.

        The encoding comes from self.detect(); if that yields nothing,
        chardet is asked; if that also yields nothing, UTF-8 is assumed.

        Returns the result of self.toUnicode(), or "" when anything fails
        (self.originalEncoding is then set to None).
        """
        try:
            encoding = self.detect(self.originalData)
            # No luck, fall back to chardet.
            if not encoding:
                encoding = chardet.detect(self.originalData)['encoding']
            if not encoding:
                print("Get encoding false, set utf-8 default")
                encoding = "utf-8"

            self.originalEncoding = encoding
            return self.toUnicode(self.originalData, encoding)
        except Exception:
            # Best effort: report failure through an empty result.
            #traceback.print_exc()
            self.originalEncoding = None
            return ""
项目:websearch    作者:abelkhan    | 项目源码 | 文件源码
def delspace(str):
    """Drop leading blanks and blanks that directly follow a space.

    "Blank" means space, newline, carriage return, tab or NUL.  A blank is
    removed when it is the first remaining character or when the character
    kept just before it is a space.

    Returns the cleaned text encoded as UTF-8, or `str` unchanged when
    chardet cannot detect an encoding.
    """
    encoding = chardet.detect(str)
    if encoding['encoding']:
        text = unicode(str, encoding['encoding'])

        # NOTE(review): the original condition listed u' ' twice; the second
        # one may have been a different blank character lost in transit.
        blanks = (u' ', u'\n', u'\r', u'\t', u'\0')
        kept = []
        # Single pass with a result list instead of re-slicing the string
        # on every removal (which was quadratic).
        for ch in text:
            if ch in blanks and (not kept or kept[-1] == u' '):
                continue
            kept.append(ch)

        str = u''.join(kept).encode('utf-8', 'ignore')

    return str
项目:websearch    作者:abelkhan    | 项目源码 | 文件源码
def docsplit(doc):
    """Split `doc` into fragments on '.', ',', ';' and '?'.

    The input is decoded with the encoding chardet detects (UTF-8 when none
    is detected).  Returns the fragments as UTF-8 encoded strings.
    """
    detected = chardet.detect(doc)['encoding']
    doc = unicode(doc, detected if detected else 'utf-8')

    fragments = doc.split(u'.')
    for separator in [u',', u';', u'?', u'?']:
        # Re-split every fragment on the next separator and flatten.
        fragments = [
            part
            for fragment in fragments
            for part in fragment.split(separator)
        ]

    return [fragment.encode('utf-8', 'ignore') for fragment in fragments]
项目:websearch    作者:abelkhan    | 项目源码 | 文件源码
def splitbykeyworks(str):
    """Split `str` on every entry of `keykorks`, one entry after another.

    The input is decoded with the encoding chardet detects (UTF-8 when none
    is detected).  Empty pieces produced by a split are replaced by the
    keyword that produced them.  Returns the pieces as UTF-8 encoded strings.
    """
    detected = chardet.detect(str)['encoding']
    text = unicode(str, detected if detected else 'utf-8')

    pieces = [text]
    for word in keykorks:
        pieces = [
            part if part != u"" else word
            for piece in pieces
            for part in piece.split(word)
        ]

    return [piece.encode('utf-8', 'ignore') for piece in pieces]
项目:websearch    作者:abelkhan    | 项目源码 | 文件源码
def splitbyclassifier(str):
    """Cut `str` after each "number + classifier" group.

    Characters are accumulated; when a character from `classifier` directly
    follows a character from `numlist`, the buffer is scanned backwards for
    the last character that is not in `numlist` and split there into two
    words.  Whatever remains in the buffer at the end becomes the last word.

    Returns a list of UTF-8 encoded strings.  Every word is encoded, so the
    list no longer mixes unicode and byte strings and matches the sibling
    split helpers.
    """
    encoding = chardet.detect(str)
    if encoding['encoding']:
        str = unicode(str, encoding['encoding'])
    else:
        str = unicode(str, 'utf-8')

    words = []

    tmp = u""
    old = u""
    for ch in str:
        tmp += ch
        if ch in classifier:
            if old in numlist:
                # Walk back over the digits to find where the number starts.
                for i in range(len(tmp)-2, 0, -1):
                    if tmp[i] not in numlist:
                        words.append(tmp[0:i+1].encode('utf-8', 'ignore'))
                        words.append(tmp[i+1:].encode('utf-8', 'ignore'))
                        tmp = u""
                        break
        old = ch
    if tmp != u"":
        words.append(tmp.encode('utf-8', 'ignore'))
    return words
项目:PocHunter    作者:DavexPro    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:PocHunter    作者:DavexPro    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:pocscan    作者:erevus-cn    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:pocscan    作者:erevus-cn    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:AnyScan    作者:zhangzhenfeng    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:AnyScan    作者:zhangzhenfeng    | 项目源码 | 文件源码
def __url_title(url):
    """Return the <title> of the page at `url`, or `url` itself as a fallback.

    The URL is printed, then fetched with a one-second timeout.  Pages whose
    detected encoding starts with 'GB' are decoded as GBK before the title
    is searched, and the title is returned UTF-8 encoded; other pages are
    searched as raw content.  Any failure returns `url`.
    """
    title = url
    print(url)
    try:
        res = requests.get(url,timeout=1).content
        # Inspect the detected encoding directly instead of matching a
        # regex against the repr of the detection dict.
        detected = chardet.detect(res)['encoding']
        if detected and detected.startswith('GB'):
            res = unicode(res, 'gbk')
            t = re.search('<title>(.*?)</title>',res)
            if t:
                title = t.group(1).encode('utf8')
        else:
            t = re.search('<title>(.*?)</title>',res)
            if t:
                title = t.group(1)
        return title
    except Exception:
        # Best effort: network and decoding problems fall back to the URL.
        #print traceback.format_exc()
        return title
项目:proxy_ip    作者:hereisok    | 项目源码 | 文件源码
def get_html(url):
    """Fetch `url` repeatedly until an acceptable page arrives.

    Up to 1000 attempts are made.  Fetching stops as soon as the last
    downloaded page is non-empty, is detected as 'utf-8' by chardet and does
    not contain the marker text checked below.  Errors during an attempt
    are ignored and the next attempt follows.

    Returns str() of the last page read ('' if no attempt succeeded).
    """
    html = ''
    for i in range(1000):
        if html and chardet.detect(html)['encoding'] == 'utf-8' and html.find('???JavaScript?????.') == -1:
            break
        try:
            resp = urllib2.urlopen(url, timeout = 10)
            try:
                html = resp.read()
            finally:
                # Release the connection even when read() fails.
                resp.close()
        except Exception:
            # Covers socket.timeout as well; simply retry.
            pass

    # print html[0:100]
    # print chardet.detect(html)
    return str(html)
项目:Mac-Python-3.X    作者:L1nwatch    | 项目源码 | 文件源码
def search_keyword_infile(file_path, word, case=True):
    """
    Read a file, decode its content and search it for a keyword via get_keyword().

    The content is decoded as UTF-8, then GBK; chardet is consulted only
    when both fail.  If chardet has no guess either, UTF-8 with replacement
    characters is used.

    :param file_path: path of the file, e.g. './compare_key.py'
    :param word: keyword to look for, e.g. "main"
    :param case: True or False; when True the keyword is lower-cased before
        the search (NOTE(review): exact meaning of the flag is defined by
        get_keyword - confirm)
    :return: whatever get_keyword(word, data, case) returns; the original
        docstring gave None or "int main()" as examples
    """
    word = word.lower() if case is True else word

    with open(file_path, "rb") as f:
        data = f.read()

    for codec in ("utf8", "gbk"):
        try:
            text = data.decode(codec)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Detection is costly on large files, so it only runs as a last resort.
        encoding = chardet.detect(data)["encoding"]
        if encoding:
            text = data.decode(encoding)
        else:
            text = data.decode("utf8", "replace")

    return get_keyword(word, text, case)
项目:kekescan    作者:xiaoxiaoleo    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:kekescan    作者:xiaoxiaoleo    | 项目源码 | 文件源码
def check(headers, html):
    """Work out the character set of an HTTP response.

    Sources are consulted in order: the `charset=` parameter of the
    content-type header, a match of `meta_re` in `html`, and finally
    chardet run on the first 10, 50 and 120 lines of `html`.

    Returns the lower-cased charset name, or '' when nothing is found.
    """
    charset = ''

    # `in` works on both Python 2 and 3; dict.has_key() is gone in Python 3.
    if headers and 'content-type' in headers:
        ct = headers['content-type'].lower()
        i = ct.find('charset=')
        if i != -1:
            charset = ct[i + len('charset='):].split(';')[0]
            # The parameter value may be quoted or padded with spaces.
            charset = charset.strip().strip('"\'')

    if html and not charset:
        ct = meta_re.search(html)
        if ct:
            charset = ct.group(1)

    if html and not charset:
        lines = html.split('\n')
        # Widen the sample until chardet reports something other than ASCII.
        for i in [10, 50, 120]:
            charset = chardet.detect('\n'.join(lines[:i]))['encoding']
            if charset and charset.lower() != 'ascii':
                break

    if charset is None:
        charset = ''
    return charset.lower()
项目:Bahubali---DDOS-Toolkit    作者:navanchauhan    | 项目源码 | 文件源码
def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Convert `markup` to unicode by trying candidate encodings in turn.

        Candidates are tried in this order until self._convertFrom() returns
        a truthy value: each entry of `overrideEncodings`, the document's
        declared encoding, the sniffed encoding, chardet's guess (when
        chardet is available), then "utf-8" and "windows-1252".

        The result is stored in self.unicode; self.originalEncoding is set
        to None when nothing worked or when `markup` needed no conversion.
        """
        self.declaredHTMLEncoding = None
        detected = self._detectEncoding(markup, isHTML)
        self.markup, documentEncoding, sniffedEncoding = detected
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []

        # Empty or already-unicode input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Caller-supplied encodings first, then declared and sniffed ones.
        u = None
        candidates = tuple(overrideEncodings) + (documentEncoding, sniffedEncoding)
        for proposedEncoding in candidates:
            u = self._convertFrom(proposedEncoding)
            if u:
                break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            guess = chardet.detect(self.markup)
            u = self._convertFrom(guess['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u:
                    break

        self.unicode = u
        if not u:
            self.originalEncoding = None
项目:gransk    作者:pcbje    | 项目源码 | 文件源码
def _ExtractFileEntries(
          self, file_system, file_entry, parent_full_path, depth):
    """Walk `file_entry` and its sub-entries, invoking self.callback on data streams.

    The entry name is decoded to text (using chardet's guess for byte
    names) and joined onto `parent_full_path`.  For depths below 5 a
    per-directory counter in self.parent.meta['directories'] is
    incremented.  When self.accept(full_path, depth) is true, the callback
    is invoked for each data stream and the sub-entries are visited
    recursively.  Entries whose path cannot be built are logged and skipped.
    """
    try:
      if isinstance(file_entry.name, six.text_type):
        name = file_entry.name
      else:
        enc = chardet.detect(file_entry.name).get('encoding')
        # chardet reports None when it has no guess (e.g. empty names).
        name = file_entry.name.decode(enc or 'utf-8')

      full_path = file_system.JoinPath([parent_full_path, name])
    except Exception:
      # Log the raw name: `name` may not have been assigned when we fail.
      logging.exception(
          "could not extract file entries: %s",
          getattr(file_entry, 'name', None))
      return

    if depth < 5:
      # Dots are replaced in the key; presumably the meta store cannot
      # hold them in keys - confirm.
      key = parent_full_path.replace('.', '\x00')
      if key not in self.parent.meta['directories']:
        self.parent.meta['directories'][key] = 0

      self.parent.meta['directories'][key] += 1

    if self.accept(full_path, depth):
      for data_stream in file_entry.data_streams:
        self.callback(file_entry, full_path, data_stream.name, self.parent)

      for sub_file_entry in file_entry.sub_file_entries:
        self._ExtractFileEntries(
            file_system, sub_file_entry, full_path, depth + 1)
项目:Projects    作者:it2school    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:Projects    作者:it2school    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:TACTIC-Handler    作者:listyque    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:TACTIC-Handler    作者:listyque    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:SharesData    作者:xjkj123    | 项目源码 | 文件源码
def smartCode(self,item):
        """Re-encode the byte string `item` as UTF-8.

        The input is decoded with the encoding chardet detects; when that
        encoding is missing, unknown or does not fit the data, GBK is used
        instead.  Returns the UTF-8 encoded text.
        """
        codedetect = chardet.detect(item)["encoding"]
        try:
            text = unicode(item, codedetect)
        except (TypeError, LookupError, UnicodeError):
            # TypeError: no encoding detected; LookupError: unknown codec.
            text = unicode(item, 'gbk')
        return text.encode("utf-8")
项目:SerpScrap    作者:ecoron    | 项目源码 | 文件源码
def adjust_encoding(data):
        """Detect the encoding of `data` and return it decoded as text.

        When chardet reports an encoding whose name does not contain
        'utf-8', the data is first transcoded to UTF-8 (best effort).  The
        data is then decoded as UTF-8, dropping undecodable bytes if a
        strict decode fails.

        Returns:
            dict: {'encoding': encoding reported by chardet (may be None),
                   'data': decoded text}
        """
        check_encoding = chardet.detect(data)
        detected = check_encoding['encoding']
        # `detected` is None when chardet has no guess; skip transcoding then.
        if detected and 'utf-8' not in detected:
            try:
                data = data.decode(detected).encode('utf-8')
            except (LookupError, UnicodeError):
                pass
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            data = data.decode('utf-8', 'ignore')
        return {'encoding': detected, 'data': data}
项目:darkc0de-old-stuff    作者:tuwid    | 项目源码 | 文件源码
def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        """Convert `markup` to unicode by trying candidate encodings in turn.

        Candidates are tried in this order until self._convertFrom() returns
        a truthy value: each entry of `overrideEncodings`, the document's
        declared encoding, the sniffed encoding, chardet's guess (when
        chardet is available), then "utf-8" and "windows-1252".

        The result is stored in self.unicode; self.originalEncoding is set
        to None when nothing worked or when `markup` was already unicode.
        """
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if isinstance(markup, unicode):
            # __init__ must return None; returning the markup raised
            # TypeError.  Record the already-decoded text instead.
            self.originalEncoding = None
            self.unicode = markup
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None
项目:UPBGE-CommunityAddon    作者:elmeunick9    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:UPBGE-CommunityAddon    作者:elmeunick9    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:llk    作者:Tycx2ry    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:llk    作者:Tycx2ry    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:purelove    作者:hucmosin    | 项目源码 | 文件源码
def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Convert `markup` to unicode by trying candidate encodings in turn.

        Candidates are tried in this order until self._convertFrom() returns
        a truthy value: each entry of `overrideEncodings`, the document's
        declared encoding, the sniffed encoding, chardet's guess (when
        chardet is available), then "utf-8" and "windows-1252".

        The result is stored in self.unicode; self.originalEncoding is set
        to None when nothing worked or when `markup` needed no conversion.
        """
        self.declaredHTMLEncoding = None
        detected = self._detectEncoding(markup, isHTML)
        self.markup, documentEncoding, sniffedEncoding = detected
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []

        # Empty or already-unicode input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Caller-supplied encodings first, then declared and sniffed ones.
        u = None
        candidates = tuple(overrideEncodings) + (documentEncoding, sniffedEncoding)
        for proposedEncoding in candidates:
            u = self._convertFrom(proposedEncoding)
            if u:
                break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            guess = chardet.detect(self.markup)
            u = self._convertFrom(guess['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u:
                    break

        self.unicode = u
        if not u:
            self.originalEncoding = None
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:riko    作者:nerevu    | 项目源码 | 文件源码
def dataReceived(self, data):
        """Decode a chunk of incoming data and feed it through the state machine.

        The encoding is detected from the first chunk when self.encoding is
        not set yet.  Each decoded character is passed to the current
        state's handler; when the handler returns a different state, the
        old state's end function and the new state's begin function are
        called.  Line and column counters are tracked locally and written
        back to self afterwards, even if a handler raises.
        """
        stateTable = self._build_state_table()
        # Keep an encoding that is already set; otherwise detect it from this chunk.
        self.encoding = self.encoding or detect(data)['encoding']
        self.check_encoding(data)
        self.state = self.state or 'begin'
        content = decode(data, self.encoding)

        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state

        # replace saveMark with a nested scope function
        # (the lambda closes over the local counters, so it reports the
        # position as it is at call time; the previous saveMark is kept
        # in _saveMark and restored in the finally block below)
        saveMark = lambda: (lineno, colno)
        self.saveMark, _saveMark = saveMark, self.saveMark

        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]

        try:
            for char in content:
                # do newline stuff
                if char == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1

                # The handler returns the next state, or a falsy value to stay.
                newState = doFn(char)

                if newState and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(char)
        finally:
            # Restore the original saveMark and publish the counters.
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno

        # state doesn't make sense if there's an exception..
        self.state = curState
项目:B.E.N.J.I.    作者:the-ethan-hunt    | 项目源码 | 文件源码
def chardet_dammit(s):
        """Return the encoding name cchardet detects for `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
项目:B.E.N.J.I.    作者:the-ethan-hunt    | 项目源码 | 文件源码
def chardet_dammit(s):
            """Return the encoding name chardet detects for `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
项目:SublimeRSS    作者:JaredMHall    | 项目源码 | 文件源码
def popContent(self, tag):
        """Pop `tag`, leave one level of content and reset the content parameters.

        Returns whatever self.pop(tag) returned.
        """
        popped = self.pop(tag)
        self.contentparams.clear()
        self.incontent -= 1
        return popped

    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored.  This is an attempt to detect
    # the most common cases.  As false positives often result in silent
    # data loss, this function errs on the conservative side.
项目:touch-pay-client    作者:HackPucBemobi    | 项目源码 | 文件源码
def popContent(self, tag):
        """Pop `tag`, leave one level of content and reset the content parameters.

        Returns whatever self.pop(tag) returned.
        """
        popped = self.pop(tag)
        self.contentparams.clear()
        self.incontent -= 1
        return popped

    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored.  This is an attempt to detect
    # the most common cases.  As false positives often result in silent
    # data loss, this function errs on the conservative side.