我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用htmlentitydefs.name2codepoint()。
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
def handle_entityref(self, ref): """Handle entity references as data, possibly converting known HTML entity references to the corresponding Unicode characters.""" replaceWithXMLEntity = self.convertXMLEntities and \ self.XML_ENTITIES_TO_CHARS.has_key(ref) if self.convertHTMLEntities or replaceWithXMLEntity: try: data = unichr(name2codepoint[ref]) except KeyError: if replaceWithXMLEntity: data = self.XML_ENTITIES_TO_CHARS.get(ref) else: data="&%s" % ref else: data = '&%s;' % ref self.handle_data(data)
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: try: name2codepoint[ref] except KeyError: text = '&%s;' % ref else: text = unichr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text)
def _html_unescape(self, str): def entity_replacer(m): entity = m.group(1) if entity in name2codepoint: return unichr(name2codepoint[entity]) else: return m.group(0) def ascii_replacer(m): cp = int(m.group(1)) if cp <= 255: return unichr(cp) else: return m.group(0) s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
def fix_text(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text)
def decodeHtmlentities(string): string = entitiesfix(string) entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});") def substitute_entity(match): from htmlentitydefs import name2codepoint as n2cp ent = match.group(2) if match.group(1) == "#": return unichr(int(ent)).encode('utf-8') else: cp = n2cp.get(ent) if cp: return unichr(cp).encode('utf-8') else: return match.group() return entity_re.subn(substitute_entity, string)[0]
def unescape(text): ''' Removes HTML or XML character references and entities from a text string. @param text The HTML (or XML) source text. @return The plain text, as a Unicode string, if necessary. ''' def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text)
def get_entitydefs(): import htmlentitydefs from codecs import latin_1_decode entitydefs = {} try: htmlentitydefs.name2codepoint except AttributeError: entitydefs = {} for name, char in htmlentitydefs.entitydefs.items(): uc = latin_1_decode(char)[0] if uc.startswith("&#") and uc.endswith(";"): uc = unescape_charref(uc[2:-1], None) entitydefs["&%s;" % name] = uc else: for name, codepoint in htmlentitydefs.name2codepoint.items(): entitydefs["&%s;" % name] = unichr(codepoint) return entitydefs
def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: name = text[1:-1] text = unichr(name2codepoint[name]) except KeyError: pass return text # leave as is
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref else: # entity resolution graciously donated by Aaron Swartz def name2cp(k): import htmlentitydefs if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] k = htmlentitydefs.entitydefs[k] if k.startswith('&#') and k.endswith(';'): return int(k[2:-1]) # not in latin-1 return ord(k) try: name2cp(ref) except KeyError: text = '&%s;' % ref else: text = unichr(name2cp(ref)).encode('utf-8') self.elementstack[-1][2].append(text)
def remove_HTML_XML_char(text): """ Removes HTML or XML character references and entities from a text string. :param text The HTML (or XML) source text. :return The plain text, as a Unicode string, if necessary. Code from 'https://github.com/attardi/wikiextractor' """ def fixup(m): text = m.group(0) code = m.group(1) try: if text[1] == "#": # character reference if text[2] == "x": return unichr(int(code[1:], 16)) else: return unichr(int(code)) else: # named entity return unichr(name2codepoint[code]) except: return text # leave as is return re.sub("&#?(\w+);", fixup, text)
def _replace_entity(match): text = match.group(1) if text[0] == u'#': text = text[1:] try: if text[0] in u'xX': c = int(text[1:], 16) else: c = int(text) return unichr(c) except ValueError: return match.group(0) else: try: return unichr(name2codepoint[text]) except (ValueError, KeyError): return match.group(0)
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: try: name2codepoint[ref] except KeyError: text = '&%s;' % ref else: text = chr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text)
def unescape(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text)
def unescape(text): def fixup(m): text = m.group(0) code = m.group(1) try: if text[1] == "#": # character reference if text[2] == "x": return unichr(int(code[1:], 16)) else: return unichr(int(code)) else: # named entity return unichr(name2codepoint[code]) except: return text # leave as is return re.sub("&#?(\w+);", fixup, text) # Match HTML comments # The buggy template {{Template:T}} has a comment terminating with just "->"