Python html2text 模块,HTML2Text() 实例源码

我们从Python开源项目中,提取了以下29个代码示例,用于说明如何使用html2text.HTML2Text()

项目:MercrediFiction    作者:Meewan    | 项目源码 | 文件源码
def to_text(html, rehtml=False):
    """Render an HTML fragment as bare text.

    The converter is configured to ignore links, images, anchors and
    emphasis markers.  Leading and trailing spaces, tabs and line breaks
    are trimmed from the converted text.

    :param html: the HTML markup to convert
    :param rehtml: when true, make the text embeddable in HTML again by
        turning newlines into ``<br/>`` and removing backslashes
    :return: the converted text
    """
    converter = HTML2Text()
    options = (
        ('wrap_links', False),
        ('skip_internal_links', True),
        ('inline_links', True),
        ('ignore_anchors', True),
        ('ignore_images', True),
        ('ignore_emphasis', True),
        ('ignore_links', True),
    )
    for option, value in options:
        setattr(converter, option, value)

    plain = converter.handle(html).strip(' \t\n\r')
    if not rehtml:
        return plain
    return plain.replace('\n', '<br/>').replace('\\', '')
项目:epuap-watchdog    作者:ad-m    | 项目源码 | 文件源码
def get_queryset(self, krs, nip, regon, google, no_regon, no_nip):
        """Build the list of lookup queries from explicit ids and Google hits.

        As a side effect this sets ``self.processor`` (an HTML-to-text
        converter) and ``self.session`` (a ``requests.Session``).

        :param krs: iterable of KRS values, or a falsy value for none
        :param nip: iterable of NIP values, or a falsy value for none
        :param regon: iterable of REGON values, or a falsy value for none
        :param google: iterable of keywords passed to ``self.search_google``,
            or a falsy value for none
        :param no_regon: when true, skip the REGON search for each keyword
        :param no_nip: when true, skip the NIP search for each keyword
        :return: list of single-key dicts (``krs``, ``nip`` or ``regon``),
            de-duplicated per key
        """
        # Copy the inputs so the ``+=`` below never extends the caller's lists.
        regon = list(regon or [])
        nip = list(nip or [])

        self.processor = html2text.HTML2Text()
        self.processor.ignore_emphasis = True
        self.processor.bypass_tables = True
        self.processor.ignore_links = True

        self.session = requests.Session()
        for keyword in tqdm(google or []):
            if not no_regon:
                result = self.search_google("{} REGON".format(keyword), REGON_PATTERN)
                print("For '{}' found {}".format(keyword, result))
                regon += result
            if not no_nip:
                result = self.search_google("{} NIP".format(keyword), NIP_PATTERN)
                print("For '{}' found {}".format(keyword, result))
                # Keep only hits that are exactly 10 characters once dashes are removed.
                stripped = (hit.replace('-', '') for hit in result)
                nip += [hit for hit in stripped if len(hit) == 10]

        queries = [{'krs': v} for v in set(krs)] if krs else []
        queries += [{'nip': v} for v in set(nip)] if nip else []
        queries += [{'regon': v} for v in set(regon)] if regon else []

        return queries
项目:aquests    作者:hansroh    | 项目源码 | 文件源码
def set_data(self, resp):
        """Store the response payload on ``self.data`` and check the status.

        * blank body: ``self.data`` becomes ``None``
        * no content type, or one starting with ``text/html``: the body is
          converted with html2text (links ignored)
        * any other ``text/`` content type: the stripped body text
        * anything else: the value of ``resp.json()``; a dict is merged into
          the existing ``self.data`` with ``update``, other values replace it

        Raises:
            AssertionError: if the status code does not start with "2".
        """
        body = resp.text
        if not body.strip():
            self.data = None
        else:
            content_type = resp.headers.get('content-type')
            if content_type is None or content_type.startswith('text/html'):
                converter = html2text.HTML2Text()
                converter.ignore_links = True
                self.data = converter.handle(body)
            elif content_type.startswith('text/'):
                self.data = body.strip()
            else:
                payload = resp.json()
                if isinstance(payload, dict):
                    self.data.update(payload)
                else:
                    self.data = payload

        status_ok = str(resp.status_code).startswith("2")
        if not status_ok:
            rule = "-" * (20 + len(resp.reason))
            raise AssertionError("%s %s\n%s\n%s" % (resp.status_code, resp.reason, rule, self))
项目:cerberus-core    作者:ovh    | 项目源码 | 文件源码
def __init__(self):
        """Prepare the e-mail storage directory, the SQLite store and the HTML converter.

        The storage directory is read from
        ``settings.GENERAL_CONFIG['email_storage_dir']`` and created when it
        does not exist.  The ``emails`` table is created if missing.

        Raises:
            MailerServiceException: if reading the setting or creating the
                directory fails.
        """
        try:
            storage_dir = settings.GENERAL_CONFIG['email_storage_dir']
            if not os.path.exists(storage_dir):
                os.makedirs(storage_dir)
        except Exception as err:
            raise MailerServiceException(err)

        db_path = storage_dir + '/' + CERBERUS_EMAIL_DB
        self._db_conn = sqlite3.connect(db_path)
        schema_cursor = self._db_conn.cursor()
        schema_cursor.execute(
            '''CREATE TABLE IF NOT EXISTS emails
                (publicid text, sender text, recipient text, subject text, body text, category text, timestamp int)'''
        )
        self._db_conn.commit()

        converter = html2text.HTML2Text()
        converter.body_width = 0
        self._html_parser = converter
项目:munch-core    作者:crunchmail    | 项目源码 | 文件源码
def mk_plaintext(self):
        """Render the HTML returned by ``self.mk_html()`` as plain text.

        Returns:
            The text produced by the html2text converter.

        Raises:
            WrongHTML: if the conversion raises ``html.parser.HTMLParseError``
                (only possible on Python versions that still define it).
        """
        h = html2text.HTML2Text()
        h.ignore_images = True
        h.inline_links = False
        h.wrap_links = False
        h.unicode_snob = True  # Prevents accent removal
        h.skip_internal_links = True
        h.ignore_anchors = True
        h.body_width = 0
        h.use_automatic_links = True
        h.ignore_tables = True

        source = self.mk_html()

        # ``HTMLParseError`` was removed from ``html.parser`` in Python 3.5.
        # Falling back to an empty tuple makes the ``except`` clause match
        # nothing there, instead of failing with AttributeError while another
        # exception is being handled.
        parse_error = getattr(html.parser, 'HTMLParseError', ())
        try:
            # The conversion is the step that parses the markup, so this is
            # the call that has to sit inside the ``try``.
            return h.handle(source)
        except parse_error as e:
            raise WrongHTML(e)
项目:jianshuHot    作者:jackeyGao    | 项目源码 | 文件源码
def parse_item(self, response):
        """Yield one ``JianshuItem`` built from an article page response.

        The title, article body, image sources and the JSON blob of the
        ``note`` script tag are pulled out with XPath; the body is converted
        to markdown and the counters/url/slug are copied from the JSON blob.
        """
        select = response.xpath
        title = select('//h1[@class="title"]/text()').extract()[0]
        body = select('//div[@class="show-content"]').extract()[0]
        note_scripts = select('//script[@data-name="note"]/text()').extract()
        images = select('//div[@class="image-package"]/img/@src').extract()
        notes = json.loads(note_scripts[0].strip())

        # Convert the body to markdown, keeping links in reference style.
        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.inline_links = False
        markdown = converter.handle(body)
        markdown = markdown.replace('-\n', '-').replace('\n?', '?')

        item = JianshuItem()
        item["title"] = title
        item["content"] = markdown
        for key in ("url", "slug", "views_count", "likes_count"):
            item[key] = notes[key]
        item["images"] = images
        yield item
项目:webmon    作者:KarolBedkowski    | 项目源码 | 文件源码
def _load_entry(self, entry, fields, add_content):
        """Format one RSS entry as a block of text.

        The values returned by ``_get_val_from_rss_entry`` come first.  When
        ``add_content`` is set and the entry has content, a blank line and the
        content (each line stripped and indented by four spaces) follow.  If
        the ``html2text`` option is enabled the content is first converted
        with html2text; a missing module is logged and the raw content is
        used instead.

        Returns the lines joined with newlines and stripped.
        """
        lines = list(_get_val_from_rss_entry(entry, fields))
        content = _get_content_from_rss_entry(entry) if add_content else None
        if content:
            if self._conf["html2text"]:
                try:
                    import html2text as h2t
                    converter = h2t.HTML2Text(bodywidth=74)
                    content = converter.handle(content)
                except ImportError:
                    self._ctx.log_error(
                        "RssInput: loading HTML2Text error "
                        "(module not found)")
            lines.append("")
            for raw_line in content.strip().split("\n"):
                lines.append("    " + raw_line.strip())
        self._ctx.log_debug(repr(lines))
        return "\n".join(lines).strip()
项目:googMeow    作者:aaaddress1    | 项目源码 | 文件源码
def printImportScreen(title, url, src, keyword):
    """Print a banner for a matching page followed by an excerpt of its text.

    The page source is converted with html2text (links ignored).  Starting
    at the first line that contains ``keyword``, up to ten non-blank lines
    are printed.

    The string handling (``encode`` followed by ``str`` operations) is
    written for Python 2 byte strings.

    :param title: page title; spaces are removed and it is cut to 50 characters
    :param url: page URL, printed as-is
    :param src: HTML source of the page
    :param keyword: text to look for in the converted page
    """
    separator = '====================================='
    print(separator)
    title = (title.encode('utf-8').replace(' ', ''))[:50] + '...'
    print('Found Keyword in the page "%s"' % (title))
    print('URL: %s' % url)
    print(separator)

    h = html2text.HTML2Text()
    h.ignore_links = True
    found_key = False
    shown = 0

    for line in h.handle(src).encode('utf-8').split('\n'):
        if keyword in line:
            found_key = True
        # split('\n') never yields '\n' itself; blank lines arrive as empty
        # or whitespace-only strings, so that is what has to be skipped.
        if not line.strip():
            continue
        if found_key:
            shown += 1
            if shown > 10:
                break
            print(line)
项目:remotor    作者:jamiebull1    | 项目源码 | 文件源码
def parse(self, response):
        """Yield a ``JobItem`` for every hit in the JSON search response.

        The description of each hit is converted to text with html2text and
        stored as a one-element list (with the item's tags, if any, appended
        to the text).  The posting date is the converted ``pub_date`` up to
        the first ``+``; failures while extracting it are logged and the item
        is still yielded.
        """
        payload = json.loads(response.text)
        to_text = html2text.HTML2Text()
        for hit in payload['hits']['hits']:
            item = JobItem()
            source = hit['_source']
            item['url'] = urljoin(
                "https://www.workingnomads.co/jobs/", source['slug'])
            item['title'] = source['title']
            item['site'] = 'WorkingNomads'
            description = to_text.handle(source['description'])
            tags = ' '.join(item.get('tags', []))
            item['text'] = [description + tags]
            try:
                posted = to_text.handle(source['pub_date'])
                item['date_posted'] = posted.split('+')[0]
            except Exception as err:
                self.logger.error(err)
            yield item
项目:code    作者:ActiveState    | 项目源码 | 文件源码
def textwindow(url):
    """Show the text rendering of the page at ``url`` in a new top-level window.

    The page is fetched with ``gethtml``, converted with html2text (links and
    images ignored), unescaped and passed through ``convert65536`` before it
    is inserted into a scrollable text widget.  The window is titled with the
    URL and closes on Escape.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    markup = converter.handle(gethtml(url))
    text = convert65536(converter.unescape(markup))

    window = Tkinter.Toplevel()
    window.geometry("+200+100")
    window.title(url)
    window.bind("<Escape>", lambda _event: window.destroy())

    scrollbar = Tkinter.Scrollbar(window)
    font = tkFont.Font(family="Arial", size=16)
    view = TextPlus(window, height=20, width=78, font=font, bg="lightgrey")
    scrollbar.pack(side=Tkinter.RIGHT, fill=Tkinter.Y)
    view.pack(side=Tkinter.LEFT, fill=Tkinter.Y)
    # Wire the scrollbar and the text widget to each other.
    scrollbar.config(command=view.yview)
    view.config(yscrollcommand=scrollbar.set)
    view.insert(Tkinter.END, text)
项目:RedditNewsPostBot    作者:LFlare    | 项目源码 | 文件源码
def replace_markdown(self, html):
        """Convert ``html`` to markdown text.

        The converter runs with ``body_width`` 0 and uses ``IMAGE`` as the
        default alt text for images.
        """
        converter = html2text.HTML2Text()
        for option, value in (("body_width", 0), ("default_image_alt", "IMAGE")):
            setattr(converter, option, value)
        return converter.handle(html)
项目:cerberus-core    作者:ovh    | 项目源码 | 文件源码
def dehtmlify(body):
    """
        Convert an HTML body to text and squeeze runs of blank lines

        :param str body: The html content
        :rtype: str
        :return: The dehtmlified content
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0
    # CRLF pairs are turned into explicit <br/> tags before the conversion.
    text = converter.handle(body.replace('\r\n', '<br/>'))
    # Each run of two or more whitespace-only lines becomes a single newline.
    blank_runs = re.compile(r'^(\s*\n){2,}', flags=re.MULTILINE)
    return blank_runs.sub('\n', text)
项目:cerberus-core    作者:ovh    | 项目源码 | 文件源码
def get_dehtmlified(report_id):
    """ Return the body of a report converted from HTML to text

        :param report_id: The id of the `Report` to load
        :rtype: dict
        :return: ``{'dehtmlify': <converted body>}``
        :raises NotFound: if the report does not exist or a ValueError occurs
    """
    try:
        report = Report.objects.get(id=report_id)
        converter = html2text.HTML2Text()
        converter.body_width = 0
        # CRLF pairs are turned into explicit <br/> tags before the conversion.
        text = converter.handle(report.body.replace('\r\n', '<br/>'))
        # Each run of two or more whitespace-only lines becomes a single newline.
        blank_runs = re.compile(r'^(\s*\n){2,}', flags=re.MULTILINE)
        return {'dehtmlify': blank_runs.sub('\n', text)}
    except (ObjectDoesNotExist, ValueError):
        raise NotFound('Report not found')
项目:m2mb    作者:thperret    | 项目源码 | 文件源码
def format_mail(loop, msg, to_text=True, ignore_tables=True):
    """Format the mail to markdown

    The first ``text/html`` part (when ``to_text`` is set) or ``text/plain``
    part found while walking the message is used as the body.  If no body
    is found, the whole message is decoded and used instead.

    Parameter
    ---------
    loop:
        Not used by this function.
    msg: email.message
    to_text: bool, optional
        Convert text/html mails to text/plain with markdown formatting
    ignore_tables: bool, optional
        Value given to the html2text converter's ``ignore_tables`` option

    Returns
    -------
    text: str
    """

    def decode_qp(payload):
        # quopri.decodestring() returns bytes; decode them so the body is
        # interpolated as text below rather than as a ``b'...'`` repr.
        return quopri.decodestring(payload).decode(errors="replace")

    body = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if to_text and content_type == "text/html":
            # The converter is only needed once an HTML part is actually used.
            converter = html2text.HTML2Text()
            converter.ignore_tables = ignore_tables
            body = converter.handle(decode_qp(part.get_payload()))
            break
        elif content_type == "text/plain":
            body = decode_qp(part.get_payload())
            break

    if not body:
        log.error("Could not find text body mail")
        body = decode_qp(msg.as_string())

    text = f"### {msg['Subject']} \n {body}"
    return text
项目:evernote-exporter    作者:shawndaniel    | 项目源码 | 文件源码
def edit_file(self, full_path, filename, to_zim=False):
        """Convert the HTML note at ``full_path`` to markdown and write it out.

        The file is read, converted with html2text, reduced to ASCII and cut
        at the first NUL character; ``\\.`` escapes are turned back into
        plain dots.  With ``to_zim`` the text is additionally passed through
        ``self.to_zim_syntax``.  The result is written to the path returned
        by ``self._rename_file``.  Conversion and write errors are reported
        through ``self._exception`` instead of being raised.
        """
        converter = html2text.HTML2Text()

        with open(full_path, 'r') as source:
            raw_html = source.read()

        content = ''
        if raw_html:
            try:
                content = converter.handle(unicode(raw_html, errors='ignore'))
                content = content.encode('ascii', 'ignore')
                content = content.split('\00')[0]   # keep the text before the first NUL
                content = content.replace('\.', '.')    # undo escaped dots
            except Exception as err:
                self._exception('convert content of note to markdown', full_path, err)

        if to_zim:
            content = self.to_zim_syntax(content)

        target = self._rename_file(full_path, filename)
        with open(target, 'w') as out:
            try:
                out.write(content.encode('ascii', 'ignore'))
            except Exception as err:
                self._exception('save note', target, err)
项目:CodeGra.de    作者:CodeGra-de    | 项目源码 | 文件源码
def send_reset_password_email(user: models.User) -> None:
    """Mail ``user`` a link for resetting their password.

    The HTML body is the configured ``EMAIL_TEMPLATE`` (blank lines turned
    into ``<br><br>``) filled in with the site url, the reset url, and the
    user's id, token, name and e-mail; name and e-mail are HTML-escaped.
    The plain-text alternative is derived from that HTML with html2text.

    :param user: The user whose reset token is generated and mailed.
    :raises APIException: If sending the message fails for any reason.
    """
    token = user.get_reset_token()
    template = current_app.config['EMAIL_TEMPLATE'].replace('\n\n', '<br><br>')
    reset_url = (
        f'{psef.app.config["EXTERNAL_URL"]}/reset_'
        f'password/?user={user.id}&token={token}'
    )
    html_body = template.format(
        site_url=current_app.config["EXTERNAL_URL"],
        url=reset_url,
        user_id=user.id,
        token=token,
        user_name=html.escape(user.name),
        user_email=html.escape(user.email),
    )

    text_maker = html2text.HTML2Text(bodywidth=78)
    text_maker.inline_links = False
    text_maker.wrap_links = False
    plain_body = text_maker.handle(html_body)

    message = Message(
        subject=f'Reset password on {psef.app.config["EXTERNAL_URL"]}',
        body=plain_body,
        html=html_body,
        recipients=[user.email],
    )
    try:
        mail.send(message)
    except Exception:
        raise APIException(
            'Something went wrong sending the email, '
            'please contact your site admin',
            f'Sending email to {user.id} went wrong.',
            APICodes.UNKOWN_ERROR,
            500,
        )
项目:DLink_Harvester    作者:MikimotoH    | 项目源码 | 文件源码
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    """Serialise ``dom`` with ``etree.tostring`` and convert the markup to text.

    The three ``ignore_*`` flags are handed to the html2text converter
    unchanged; ``body_width`` is set to 0.
    """
    from lxml import etree
    import html2text

    converter = html2text.HTML2Text()
    converter.body_width = 0
    for option, value in (
        ('ignore_images', ignore_images),
        ('ignore_emphasis', ignore_emphasis),
        ('ignore_tables', ignore_tables),
    ):
        setattr(converter, option, value)
    markup = etree.tostring(dom).decode()
    return converter.handle(markup)
项目:fire    作者:FundersClub    | 项目源码 | 文件源码
def reply_from_template(self, template_name, extra_context=None, html=False):
        """Reply to this message with the rendered template.

        The template context contains ``msg`` (this object) and ``settings``,
        updated with ``extra_context`` when given.  The reply goes to the
        message's sender with the subject prefixed by ``Re: ``.

        :param template_name: template rendered with ``render_to_string``
        :param extra_context: optional dict merged into the template context
        :param html: when true, the rendered template is sent as the
            ``text/html`` alternative and its html2text conversion as the
            plain-text body; otherwise it is sent as the plain body
        :return: the value returned by the send call, in both branches
        """
        context = {
            'msg': self,
            'settings': settings,
        }
        if extra_context:
            context.update(extra_context)

        body = render_to_string(template_name, context)
        subject = 'Re: ' + self.subject
        if self.from_name:
            to = '{} <{}>'.format(self.from_name, self.from_email)
        else:
            to = self.from_email

        if html:
            h = html2text.HTML2Text(bodywidth=0)
            text_content = h.handle(body)
            msg = EmailMultiAlternatives(subject, text_content, settings.DEFAULT_FROM_EMAIL, [to])
            msg.attach_alternative(body, "text/html")
            # Return the send result so both branches report the same thing.
            return msg.send(fail_silently=False)

        return send_mail(
            subject,
            body,
            settings.DEFAULT_FROM_EMAIL,
            [to],
            fail_silently=False,
        )
项目:wechannel    作者:PegasusWang    | 项目源码 | 文件源码
def html2markdown(html):
    """Convert unicode ``html`` to markdown, ignoring images and links.

    Falsy input (empty string, ``None``) is returned unchanged.
    """
    if html:
        converter = html2text.HTML2Text()
        converter.ignore_images = True
        converter.ignore_links = True
        return converter.handle(html)
    return html
项目:markbj    作者:chaijunit    | 项目源码 | 文件源码
def html2markdown(html, url, download_image, image_path):
    """Convert ``html`` to markdown with ``bodywidth`` 0.

    With ``download_image`` set, the markup is first passed through
    ``download_html_image(url, html, image_path)`` and converted without a
    base URL; otherwise ``url`` is given to the converter as ``baseurl``.
    """
    if download_image:
        html = download_html_image(url, html, image_path)
        converter = HTML2Text(bodywidth = 0)
    else:
        converter = HTML2Text(baseurl = url, bodywidth = 0)
    return converter.handle(html)
项目:webmon    作者:KarolBedkowski    | 项目源码 | 文件源码
def _filter(self, item: str, result: common.Result) -> ty.Iterable[str]:
        """Yield ``item`` converted from HTML to text.

        The converter's ``bodywidth`` comes from the ``width`` entry of the
        filter configuration.  ``result`` is not used here.

        Raises:
            common.FilterError: if the ``html2text`` module is not installed.
        """
        assert isinstance(item, str)
        try:
            import html2text as h2t
        except ImportError:
            raise common.FilterError(self, "module html2text not found")

        width = self._conf.get("width")
        yield h2t.HTML2Text(bodywidth=width).handle(item)
项目:pythonista-scripts    作者:khilnani    | 项目源码 | 文件源码
def main():
    """Speak the shared/clipboard text, or the page behind a URL found in it.

    The text and URL come from the share extension when running as one,
    otherwise the text comes from the clipboard.  If no URL was given, the
    first URL found in the text is used.  With a URL, the page is fetched
    and its html2text rendering is spoken; without one, the text itself is.

    Returns ``True`` when the page could not be fetched, otherwise ``None``.
    """
    speech.stop()
    if appex.is_running_extension():
        text = appex.get_text()
        url = appex.get_url()
    else:
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None

    if url is None:
        try:
            url = GRUBER_URLINTEXT_PAT.findall(text)[0][0]
        except (IndexError, TypeError):
            # No URL in the text (IndexError) or no text at all (TypeError):
            # fall through and speak the text itself.
            pass

    if url is not None:
        console.hud_alert('Reading: ' + url)
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError:
            console.alert('Unable to connect to url.')
            return True
        # ``r.text`` is already decoded text, so it is used as-is.
        text = html2text.html2text(r.text)
    else:
        console.hud_alert('Reading text: ' + str(text))

    if text:
        speech.say(text)
        console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.')
项目:pythonista-scripts    作者:khilnani    | 项目源码 | 文件源码
def main():
    """Copy the markdown rendering of a web page to the clipboard.

    The URL comes from the share extension (or the first URL in the shared
    text); outside the extension it is pre-filled from the clipboard and
    confirmed in an input dialog.  After the conversion the user is offered
    to open Evernote with a new note created from the clipboard.

    Returns ``True`` when no URL is available, the dialog is cancelled or
    the page cannot be fetched; otherwise ``None``.
    """
    def first_url(text):
        # Return the first URL found in ``text``, or None when there is none.
        matches = GRUBER_URLINTEXT_PAT.findall(text or '')
        return matches[0][0] if matches else None

    if appex.is_running_extension():
        url = appex.get_url()
        if url is None:
            url = first_url(appex.get_text())
        if url is None:
            console.hud_alert('No URL found.')
            return True
    else:
        url = first_url(clipboard.get().strip())
        # Without a usable URL in the clipboard, start the dialog from a stub.
        if url is None or "http" not in url:
            url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except KeyboardInterrupt:
            # presumably raised when the dialog is cancelled; TODO confirm
            return True

    console.hud_alert('URL: %s' % url)

    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except Exception:
        # Report the failure and stop; a message string cannot be raised.
        console.alert('Unable to connect to url.')
        return True

    # ``r.text`` is already decoded text, so it is used as-is.
    rendered_content = html2text.html2text(r.text)
    clipboard.set(rendered_content)

    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?', button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish()
项目:Pythonista_scripts    作者:wizardofozzie    | 项目源码 | 文件源码
def main():
    """Save the printable version of a gamefaqs FAQ as text in ~/Documents.

    The URL comes from the share extension or the clipboard; when the
    clipboard does not hold a URL matching ``RE_URL`` the user is asked for
    one (cancelling exits).  Nothing is done if the final URL does not match
    ``RE_URL``.  Otherwise ``<url>?print=1`` is fetched, converted with
    html2text and written to a ``.txt`` file named after the URL path
    between ``gamefaqs.com/<first segment>/`` and ``/faqs``.
    """
    if appex.is_running_extension():
        url = appex.get_url()
    else:
        url = clipboard.get().strip()
        if not RE_URL.match(url):
            try:
                url = console.input_alert("Enter gamefaqs URL", "", "https://www.gamefaqs.com/")
            except KeyboardInterrupt:
                sys.exit(0)

    # ``url`` is presumably None when the extension was given no URL; guard
    # so the regex match below does not raise TypeError.
    if not url or not RE_URL.match(url):
        return

    # Example url: http://www.gamefaqs.com/ps3/959558-fallout-new-vegas/faqs/61226
    newurl = "{0}?print=1".format(url)
    r = requests.get(
        url=newurl,
        headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
    )
    # ``r.text`` is already decoded text, so it is used as-is.
    rendered_content = html2text.html2text(r.text)
    filename = url.partition("gamefaqs.com/")[-1].partition("/")[-1].partition("/faqs")[0]+".txt"
    filepath = os.path.join(os.path.expanduser("~/Documents"), filename)

    with open(filepath, "w") as fo:
        fo.write(rendered_content)

    console.hud_alert('Success! Saved {0}'.format(filename), "success")
项目:web-search-engine    作者:AnthonySigogne    | 项目源码 | 文件源码
def detect_language(html) :
    """
    Detect the language of the text content of a page.

    ``html`` may be bytes or text.  Bytes are decoded as UTF-8, falling back
    to latin1 when they are not valid UTF-8; text is used unchanged.  The
    markup is converted with html2text and the result is passed to
    ``langdetect.detect``, whose return value is returned.
    """
    if isinstance(html, (bytes, bytearray)) :
        try :
            html = html.decode("utf8")
        except UnicodeDecodeError :
            # latin1 maps every byte value, so this fallback cannot fail
            html = html.decode("latin1")
    h = html2text.HTML2Text()
    return langdetect.detect(h.handle(html))
项目:ahmia-crawler    作者:ahmia    | 项目源码 | 文件源码
def html2string(self, response):
        """Return the text rendering of the response body.

        The body is decoded with the encoding reported by
        ``self.detect_encoding`` (undecodable bytes are ignored) and converted
        with html2text, links ignored.
        """
        charset = self.detect_encoding(response)
        markup = response.body.decode(charset, 'ignore')
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        return converter.handle(markup)
项目:ctutlz    作者:theno    | 项目源码 | 文件源码
def _text_from_html(html):
    """Return ``html`` converted to text by html2text with links ignored."""
    converter = html2text.HTML2Text()
    setattr(converter, 'ignore_links', True)
    text = converter.handle(html)
    return text
项目:uptec-menu    作者:manelmadeira    | 项目源码 | 文件源码
def get_html(filename):
    """Read ``pdf/<filename>.html`` as UTF-8 and return its html2text rendering."""
    converter = html2text.HTML2Text()
    path = 'pdf/' + filename + '.html'
    with io.open(path, 'r', encoding='utf-8') as source:
        return converter.handle(source.read())
项目:fire    作者:FundersClub    | 项目源码 | 文件源码
def msg_to_markdown(repo, msg):
    """Render an incoming e-mail message as markdown.

    The HTML body (when present) is converted with html2text after inline
    ``cid:`` images have been pointed at the matching attachment's file URL;
    otherwise the plain-text body is used.  Attachments that were not shown
    inline are listed at the end, with image attachments also embedded.
    The header line names the sender: ``@login`` when the sender's address
    is found in the repo's email map, the sender's display name otherwise.

    :param repo: object whose ``emailmap_set`` is searched for the sender
    :param msg: the message; its body, sender fields and ``attachment_set``
        are read
    :return: the markdown text
    """
    # NOTE(review): this helper is not called anywhere in this function.
    def absurl(url):
        if not url.startswith('http:/') and not url.startswith('https:'):
            slash = '' if settings.BASE_URL.endswith('/') or url.startswith('/') else '/'
            return settings.BASE_URL + slash + url
        return url

    # Need a map of content id -> attachment
    all_attachments = list(msg.attachment_set.all())
    attachments_map = {
        att.content_id: att for att in all_attachments if att.content_id
    }

    # Attempt to update img elements pointing to an attachment
    attachments_observed = set()
    if msg.body_html:
        soup = BeautifulSoup(msg.body_html, 'html.parser')
        for img in soup.find_all('img'):
            src = img.attrs.get('src')
            if not src or not src.startswith('cid:'):
                continue

            # Strip only the leading scheme, not later occurrences of "cid:".
            att = attachments_map.get(src[len('cid:'):])
            if att:
                img['src'] = att.file.url
                attachments_observed.add(att)

        h = html2text.HTML2Text(bodywidth=0)
        msg_body = h.handle(str(soup))
    else:
        msg_body = msg.body_text

    # Look for attachments we didn't display inline
    attachments = [att for att in all_attachments if att not in attachments_observed]
    if attachments:
        attachments_text = u'\n\n\n\n---\n*Attachments:*\n\n'
        for att in attachments:
            url = att.file.url
            filename = os.path.basename(att.file.name)
            inline_img = ''
            # Every extension needs its own tuple entry: adjacent string
            # literals are concatenated into a single one.
            if filename.lower().split('.')[-1] in ('png', 'gif', 'jpeg', 'jpg', 'svg'):
                inline_img = u'\n  ![]({})\n'.format(url)
            attachments_text += u'1. [{}]({}){}\n'.format(filename, url, inline_img)
    else:
        attachments_text = ''

    # See if we recognize this email address
    map_entry = repo.emailmap_set.filter(email__iexact=msg.from_email).first()
    if map_entry:
        tag = '@' + map_entry.login
    else:
        tag = msg.from_name

    return u'*Sent by {} ({}). Created by [fire]({}/).*\n\n---\n{}{}'.format(
        tag,
        msg.from_email,
        settings.BASE_URL,
        msg_body,
        attachments_text,
    )