Java 类org.jsoup.nodes.Document 实例源码

项目:WebtoonDownloadManager    文件:ManualController.java   
/**
 * Looks up a webtoon by its code and shows the result in the UI.
 *
 * <p>The page is fetched through {@code CommonService#getConnection(String)} with a
 * 5000 ms timeout. The title (text before the first {@code "::"} of the
 * {@code <title>} element), author, description and thumbnail are then read from
 * the document and pushed into the corresponding controls. A {@code null} or empty
 * code shows an informational alert instead of issuing a request.
 *
 * @param code the webtoon code; {@code null} and {@code ""} are both treated as
 *             "no code entered"
 */
public void getWebtoon(String code) {

    // Treat null like an empty code: the previous ""-only check let null through
    // and handed it to the connection factory.
    if (code == null || code.isEmpty()) {
        Platform.runLater(new Runnable() {
            @Override
            public void run() {
                AlertSupport alert = new AlertSupport("웹툰코드를 입력하세요.");
                alert.alertInfoMsg(stage);
            }
        });
        return;
    }

    CommonService cs = new CommonService();

    Connection conn = cs.getConnection(code);
    conn.timeout(5000);

    codeInputField.setText(code);
    wDesc.setWrapText(true);

    try {
        Document doc = conn.get();

        // The <title> text is split on "::" and only the leading part is used.
        String title = doc.select("title").text().split("::")[0];
        setTitle(title);

        String author = doc.select("div.detail h2 > span").text();
        wTitle.setText(title + "(" + author + ")");

        String desc = doc.select("div.detail p").text();
        wDesc.setText(desc);

        String img = doc.select("div.thumb > a img").attr("src");
        // Second constructor argument requests background loading of the image.
        thumbnail.setImage(new Image(img, true));

    } catch (Exception e) {
        // Fetch/parse failures are only reported on stderr; the UI keeps its old state.
        e.printStackTrace();
    }
}
项目:NewKakaoBot    文件:ScriptUtil.java   
/**
 * Downloads {@code url} on a newly started thread, applies the CSS query
 * {@code option} and passes the combined text of the matches to {@code func}.
 *
 * <p>The callback receives {@code [text, null]} on success and {@code [null, exception]}
 * when the download fails with an {@link IOException}.
 *
 * @param url    address of the page to download
 * @param option CSS selector evaluated against the downloaded document
 * @param func   script function that receives the result
 * @throws IOException declared by the signature; the visible body never throws it
 */
@JSStaticFunction
public static void parseToText(final String url, final String option, final Function func) throws IOException {
    final Runnable fetchTask = new Runnable() {
        @Override
        public void run() {
            try {
                Elements selected = Jsoup.connect(url).get().select(option);
                func.call(context, scope, scope, new Object[] { selected.text(), null });
            } catch (IOException e) {
                try {
                    func.call(context, scope, scope, new Object[] { null, e });
                } catch (Exception ignored) {
                    // A failure raised by the error callback itself is dropped.
                }
            }
        }
    };
    new Thread(fetchTask).start();
}
项目:ripme    文件:XhamsterRipper.java   
/**
 * Collects the image URLs of a gallery page, rewriting each thumbnail URL
 * into the URL of the full-sized image.
 *
 * @param doc gallery page
 * @return rewritten {@code src} values, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    final String thumbSelector =
            "div.picture_view > div.pictures_block > div.items > div.item-container > a > div.thumb_container > div.img > img";
    List<String> fullSizeUrls = new ArrayList<>();
    for (Element img : doc.select(thumbSelector)) {
        String url = img.attr("src")
                // thumbnail hosts -> full-size hosts
                .replaceAll("https://upt.xhcdn\\.", "http://up.xhamster.")
                .replaceAll("ept\\.xhcdn", "ep.xhamster")
                // 160px variant -> 1000px variant
                .replaceAll("_160\\.", "_1000.")
                // Some CDNs serve invalid certificates, so every request is downgraded to http.
                .replaceAll("https", "http");
        fullSizeUrls.add(url);
    }
    return fullSizeUrls;
}
项目:crawler-jsoup-maven    文件:JsoupUtil.java   
/**
 * Fetches a document, optionally attaching a single cookie to the request.
 *
 * <p>When both key and value are empty the plain {@code getDocument(url)} is used;
 * when both are present they are wrapped in a one-entry map and passed to
 * {@code getDocumentWithCookies}. Supplying only one of the two is rejected.
 *
 * @param url           URL to fetch
 * @param cookieKey     cookie name
 * @param cookieValue   cookie value
 * @return the fetched document
 * @throws IllegalArgumentException if exactly one of key/value is empty
 * @throws Exception whatever the delegated fetch method throws
 */
public static Document getDocumentWithCookie(String url, String cookieKey, String cookieValue) throws Exception {

    final boolean keyMissing = StringUtil.isEmpty(cookieKey);
    final boolean valueMissing = StringUtil.isEmpty(cookieValue);

    if (keyMissing != valueMissing) {
        // Only half of the pair was supplied: invalid arguments.
        throw new IllegalArgumentException("key or value is err"); // TODO (bluetata, 2017/03/20): replace the hard-coded message
    }

    if (keyMissing) {
        return getDocument(url);
    }

    Map<String, String> cookiesMap = new HashMap<String, String>();
    cookiesMap.put(cookieKey, cookieValue);
    return getDocumentWithCookies(url, cookiesMap);
}
项目:crawling-framework    文件:DateParser.java   
/**
 * Collects date candidates from elements matched by {@code ITEMPROP_SELECTORS}.
 *
 * <p>For each matched element the first non-empty attribute among
 * {@code datetime}, {@code content} and {@code title} (in that order) is recorded
 * together with the selector that matched. Elements with none of them are skipped.
 *
 * @param document parsed page
 * @return the matches in selector order, then document order
 */
public static List<MatchedDate> extractFromProperties(Document document) {
    List<MatchedDate> matches = Lists.newArrayList();

    for (String selector : ITEMPROP_SELECTORS) {
        document.select(selector).forEach(node -> {
            String value = node.attr("datetime");
            if (Strings.isNullOrEmpty(value)) {
                value = node.attr("content");
            }
            if (Strings.isNullOrEmpty(value)) {
                value = node.attr("title");
            }
            if (!Strings.isNullOrEmpty(value)) {
                matches.add(new MatchedDate(value, selector));
            }
        });
    }

    return matches;
}
项目:PicKing    文件:XiuMM.java   
/**
 * Parses an album listing page into {@code AlbumInfo} entries.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives the output and is returned
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));
    List<AlbumInfo> albums = new ArrayList<>();

    for (Element albumNode : page.select("div.album")) {
        AlbumInfo info = new AlbumInfo();

        // Title is optional: only set when a span.name exists.
        Elements names = albumNode.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.get(0).text());
        }

        Elements links = albumNode.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        // Cover picture is optional as well.
        Elements covers = links.select("img");
        if (!covers.isEmpty()) {
            info.setPicUrl(covers.get(0).attr("src"));
        }

        albums.add(info);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, albums);
    return resultMap;
}
项目:Linda-AI    文件:Act.java   
/**
 * Searches the Italian Wikipedia for the term enclosed between the first and
 * last apostrophe of {@code dico} and reports the outcome through {@code GUI}.
 *
 * <p>The text of all {@code <p>} elements of the article is sent as the response.
 * If the input does not contain a pair of apostrophes the user is reminded to
 * quote the term; an HTTP error status yields a "no entry" message; other I/O
 * errors are printed to stderr.
 *
 * @param dico the user's sentence, expected to contain the term as {@code '...'}
 */
static void Wikipedia(String dico) {
    // Extract the quoted term up front. Previously the substring call sat outside
    // the try block, so a missing quote pair raised an uncaught
    // StringIndexOutOfBoundsException instead of showing the reminder below.
    final int openQuote = dico.indexOf("'");
    final int closeQuote = dico.lastIndexOf("'");
    if (closeQuote <= openQuote) {
        // Covers "no apostrophe at all" (-1/-1) and "only one apostrophe".
        new GUI().giveResponse("Ricorda che, perché io cerchi informazioni riguardo a qualcosa, occorre che tu la definisca fra due virgolette!");
        return;
    }
    final String cercowikipedia = dico.substring(openQuote + 1, closeQuote);

    try {
        Document significatowikipedia = Jsoup.connect("https://it.wikipedia.org/wiki/" + cercowikipedia.replace(" ", "_")).userAgent("Mozilla").get();
        String divs = significatowikipedia.select("p").text();
        if (!divs.equals("")) {
            new GUI().giveResponse("La ricerca di " + cercowikipedia + " su wikipedia ha restituito il seguente risultato:" + '\n' + divs);
        } else {
            new GUI().giveResponse("Mi dispiace, non ho trovato informazioni su " + cercowikipedia + " su Wikipedia...");
        }
    } catch (HttpStatusException e) {
        // Raised by jsoup for non-OK HTTP responses, e.g. a missing article.
        new GUI().giveResponse("Mi dispiace, Wikipedia sembra non avere una voce per '" + cercowikipedia +"'...");
    } catch (java.io.IOException f) {
        f.printStackTrace();
    }
}
项目:catpeds    文件:PawpedsDocumentParserTest.java   
/**
 * Verifies that {@link PawpedsDocumentParser#parseSearch(Document)} throws an
 * {@link IllegalArgumentException} when jsoup raises a selector parsing error.
 */
@Test(expected = IllegalArgumentException.class)
public void testJsoupSelectorUnexpectedError() throws Exception {
    // Given: the "th.error" lookup yields an element set with empty text ...
    Elements emptyErrorText = mock(Elements.class);
    when(emptyErrorText.text()).thenReturn("");

    Document page = mock(Document.class);
    when(page.select("th.error")).thenReturn(emptyErrorText);

    // ... while the search-result lookup fails inside jsoup.
    when(page.select("table.searchresult tr.searchresult:has(td.searchresult)"))
            .thenThrow(SelectorParseException.class);

    // When
    pawpedsDocumentParser.parseSearch(page);

    // Then
    // the expected exception is declared on the @Test annotation
}
项目:ConnectU    文件:AnunciosRequest.java   
/**
 * Requests {@code ANUNCIOS_URL} and parses every child of the announcements
 * list, reporting the outcome through {@code callback}.
 *
 * @param callback receives {@code (true, "")} after all announcements were parsed,
 *                 {@code (false, LOGIN_REJECTED)} when parsing hits a
 *                 {@link NullPointerException}, or {@code (false, body)} when the
 *                 request itself was unsuccessful
 */
public void loadAnuncios(final AnunciosCallback callback) {
    UAWebService.HttpWebGetRequest(context, ANUNCIOS_URL, new UAWebService.WebCallBack() {
        @Override
        public void onNavigationComplete(boolean isSuccessful, String body) {
            if (!isSuccessful) {
                callback.onResult(false, body);
                return;
            }

            // first() yields null when the selector matches nothing; the resulting
            // NullPointerException is handled by the catch block below.
            Element listBody = Jsoup.parse(body).select(ANUNCIOS_LIST_BODY).first();
            try {
                for (Element item : listBody.children()) {
                    parseAnuncio(item, "");
                }
                callback.onResult(true, "");
            } catch (NullPointerException e) {
                FirebaseCrash.log(body);
                FirebaseCrash.report(e);
                callback.onResult(false, ErrorManager.LOGIN_REJECTED); //Usually because session ended!
            }
        }
    });
}
项目:JsoupSample    文件:ImageListPresenterImpl.java   
/**
 * Extracts the image list from {@code document} using the manager that matches
 * the view's current type.
 *
 * @param document parsed page
 * @return the images found, or an empty list when there is no view or the type
 *         is not one of the handled constants
 */
@Override
public List<ImageModel> getT(Document document) {
    if (view == null) {
        return new ArrayList<>();
    }

    final List<ImageModel> images;
    switch (view.getType()) {
        case ApiConfig.Type.DOU_BAN_MEI_ZI:
            images = JsoupDoubanManager.get(document).getImageList();
            break;
        case ApiConfig.Type.KK:
            images = JsoupKKManager.get(document).getImageList();
            break;
        case ApiConfig.Type.M_ZI_TU:
            images = JsoupMZiTuManager.get(document).getImageList();
            break;
        case ApiConfig.Type.MM:
            images = JsoupMMManager.get(document).getImageList();
            break;
        case ApiConfig.Type.MEIZITU:
            images = JsoupMeiZiTuManager.get(document).getImageList();
            break;
        default:
            images = new ArrayList<>();
            break;
    }
    return images;
}
项目:ripme    文件:DeviantartRipper.java   
/**
 * Collects the {@code href} of every visible thumbnail on the page.
 *
 * <p>Each thumbnail's href is logged first; iteration stops as soon as
 * {@code isStopped()} reports true. Thumbnails whose first {@code img} has
 * {@code transparent="false"} are skipped.
 *
 * @param page gallery page
 * @return hrefs of the accepted thumbnails, in document order
 */
@Override
public List<String> getDescriptionsFromPage(Document page) {
    List<String> descriptionLinks = new ArrayList<>();
    for (Element thumb : page.select("div.zones-container span.thumb")) {
        String href = thumb.attr("href");
        logger.info(href);
        if (isStopped()) {
            break;
        }
        Element firstImage = thumb.select("img").get(0);
        // a.thumbs that point to other albums are invisible and must be skipped
        if (!firstImage.attr("transparent").equals("false")) {
            descriptionLinks.add(href);
        }
    }
    return descriptionLinks;
}
项目:ripme    文件:FlickrRipper.java   
/**
 * Logs in to Flickr.
 *
 * <p>Loads the sign-in page, copies all hidden form inputs, adds the credential
 * fields (Base64-encoded literals embedded in this method) and posts them to the
 * action URL of the first {@code form[method=post]}, reusing the cookies of the
 * first response.
 *
 * @return cookies of the response to the login POST
 * @throws IOException if either HTTP request fails
 */
@SuppressWarnings("unused")
private Map<String,String> signinToFlickr() throws IOException {
    Response signinPage = Jsoup.connect("http://www.flickr.com/signin/")
                               .userAgent(USER_AGENT)
                               .followRedirects(true)
                               .method(Method.GET)
                               .execute();
    Document form = signinPage.parse();

    // Start from the hidden inputs so server-issued tokens are echoed back.
    Map<String,String> fields = new HashMap<>();
    for (Element hidden : form.select("input[type=hidden]")) {
        fields.put(hidden.attr("name"), hidden.attr("value"));
    }
    fields.put("passwd_raw", "");
    fields.put(".save", "");
    fields.put("login", new String(Base64.decode("bGVmYWtlZGVmYWtl")));
    fields.put("passwd", new String(Base64.decode("MUZha2V5ZmFrZQ==")));

    String postUrl = form.select("form[method=post]").get(0).attr("action");
    Response loginResponse = Jsoup.connect(postUrl)
                                  .cookies(signinPage.cookies())
                                  .data(fields)
                                  .method(Method.POST)
                                  .execute();
    return loginResponse.cookies();
}
项目:android-apps    文件:JsoupUtil.java   
/**
 * Downloads a catalog page and stores its chapter list in {@code map}.
 *
 * <p>Every element with class {@code list-group-item} that contains an anchor
 * becomes one entry with the keys {@code "catalog"} (element text) and
 * {@code "href"} (anchor href prefixed with {@code http://www.meiyuxs.com}).
 * On success the map receives {@code "data"} (the entry list), {@code "cover"}
 * (always empty) and {@code "lastChapter"} (the {@code "catalog"} text of the
 * last entry, or {@code ""} when no entry was found). On an I/O failure the
 * stack trace is printed and the map is returned untouched.
 *
 * @param map result holder that is filled and returned
 * @param url catalog page to download
 * @return {@code map}
 */
private static Map meiyuxsCatalog(Map map, String url) {
  try {
    List<Map<String, Object>> data = new ArrayList<>();
    Document document = Jsoup
        .connect(url)
        .userAgent(FormatUtil.USER_AGENT_PC)
        .get();
    Element body = document.body();
    Elements catalogEles = body.getElementsByClass("list-group-item");
    for (Element catalogE : catalogEles) {
      // first() is null when the item has no anchor; such items are skipped.
      Element link = catalogE.getElementsByTag("a").first();
      if (link == null) {
        continue;
      }
      Map<String, Object> _map = new HashMap<>();
      _map.put("catalog", catalogE.text());
      _map.put("href", "http://www.meiyuxs.com" + link.attr("href"));
      data.add(_map);
    }
    // An empty catalog used to fail on data.get(-1); fall back to "" instead.
    String lastChapter = data.isEmpty()
        ? ""
        : data.get(data.size() - 1).get("catalog").toString();
    map.put("data", data);
    map.put("cover", "");
    map.put("lastChapter", lastChapter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return map;
}
项目:PicKing    文件:Mntu92.java   
/**
 * Parses a detail page into a single {@code PicInfo}.
 *
 * @param baseUrl    prefix prepended to the image {@code src}
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives the output and is returned
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT}
 *         (a one-element list) set
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));
    PicInfo picture = new PicInfo();

    // Each match overwrites the URL, so the last "#bigpic img" wins.
    for (Element img : page.select("#bigpic img")) {
        picture.setPicUrl(baseUrl + img.attr("src"));
    }

    Elements heading = page.select("#entry h1");
    if (!heading.isEmpty()) {
        picture.setTitle(heading.text());
    }

    Elements tagLinks = page.select(".postinfo a");
    if (!tagLinks.isEmpty()) {
        List<String> tagNames = new ArrayList<>();
        for (Element tagLink : tagLinks) {
            tagNames.add(tagLink.text());
        }
        picture.setTags(tagNames);
    }

    List<PicInfo> pictures = new ArrayList<>();
    pictures.add(picture);

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, pictures);
    return resultMap;
}
项目:android-apps    文件:JsoupUtil.java   
/**
 * Downloads a catalog page and stores its chapter list and cover in {@code map}.
 *
 * <p>Every {@code dd} inside the element with id {@code list} that contains an
 * anchor becomes one entry with the keys {@code "catalog"} (element text) and
 * {@code "href"} ({@code url} followed by the anchor href). The cover is the
 * {@code src} of the first {@code img} inside the element with id {@code fmimg}.
 * On success the map receives {@code "data"}, {@code "cover"} and
 * {@code "lastChapter"}; missing page parts produce an empty list / empty strings
 * rather than a runtime exception. On an I/O failure the stack trace is printed
 * and the map is returned untouched.
 *
 * @param map result holder that is filled and returned
 * @param url catalog page to download; also used as the prefix of chapter links
 * @return {@code map}
 */
private static Map qulaCatalog(Map map, String url) {
  try {
    List<Map<String, Object>> data = new ArrayList<>();
    Document document = Jsoup
        .connect(url)
        .userAgent(FormatUtil.USER_AGENT_PC)
        .get();
    Element body = document.body();

    // getElementById returns null when the id is absent; treat that as "no chapters".
    Element list = body.getElementById("list");
    if (list != null) {
      for (Element catalogE : list.getElementsByTag("dd")) {
        Element link = catalogE.getElementsByTag("a").first();
        if (link == null) {
          continue;
        }
        Map<String, Object> _map = new HashMap<>();
        _map.put("catalog", catalogE.text());
        _map.put("href", url + link.attr("href"));
        data.add(_map);
      }
    }

    // Cover is optional: a missing container or image yields "".
    Element coverBox = body.getElementById("fmimg");
    Element coverImg = coverBox == null ? null : coverBox.getElementsByTag("img").first();
    String cover = coverImg == null ? "" : coverImg.attr("src");

    // An empty catalog used to fail on data.get(-1); fall back to "" instead.
    String lastChapter = data.isEmpty()
        ? ""
        : data.get(data.size() - 1).get("catalog").toString();

    map.put("data", data);
    map.put("cover", cover);
    map.put("lastChapter", lastChapter);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return map;
}
项目:wulkanowy    文件:StudentAndParentTestCase.java   
/**
 * Builds a {@code StudentAndParent} mock backed by an HTML fixture.
 *
 * <p>{@code getSnPPageDocument} returns the parsed fixture for any string, while
 * {@code getSemesters}, {@code getCurrentSemester} and
 * {@code getRowDataChildValue} delegate to their real implementations.
 *
 * @param fixtureFileName resource name resolved relative to the test class
 * @return the configured mock
 * @throws Exception if the fixture cannot be read
 */
protected StudentAndParent getSnp(String fixtureFileName) throws Exception {
    Document fixturePage = Jsoup.parse(
            FixtureHelper.getAsString(getClass().getResourceAsStream(fixtureFileName)));

    StudentAndParent mockedSnp = Mockito.mock(StudentAndParent.class);

    // Parsing helpers run their real code ...
    Mockito.when(mockedSnp.getSemesters(Mockito.any(Document.class)))
            .thenCallRealMethod();
    Mockito.when(mockedSnp.getCurrentSemester(Mockito.<Semester>anyList()))
            .thenCallRealMethod();
    Mockito.when(mockedSnp.getRowDataChildValue(Mockito.any(Element.class), Mockito.anyInt()))
            .thenCallRealMethod();

    // ... while the page itself always comes from the fixture.
    Mockito.when(mockedSnp.getSnPPageDocument(Mockito.anyString()))
            .thenReturn(fixturePage);

    return mockedSnp;
}
项目:fantalegheEV    文件:RemoteController.java   
/**
 * Downloads the calendar page of a league and collects the results per match day.
 *
 * <p>Each {@code .table} element is one match day, numbered from 1 in document
 * order. Processing stops at the first table that has a {@code .greyfoot} child.
 * Every {@code .match} contributes two {@code TeamResult}s: child 0 paired with
 * the number in child 1, and child 3 paired with the number in child 2 (decimal
 * commas are converted to dots before parsing).
 *
 * @param leagueName league identifier inserted into the calendar URL
 * @return match day number mapped to the results of that day
 * @throws IOException if the page cannot be downloaded
 * @throws NumberFormatException if a score cell does not contain a number
 */
private Map<Integer, List<TeamResult>> getResults(String leagueName) throws IOException{
    String url = GlobalConfiguration.baseURL + leagueName + GlobalConfiguration.calendarSuffix;
    Document doc = Jsoup.connect(url).get();
    Map<Integer, List<TeamResult>> results = new HashMap<>();
    int day = 1;
    // Typed for-each replaces the raw Iterator and its unchecked cast.
    for (Element calendarDay : doc.select(".table")) {
        if (calendarDay.children().is(".greyfoot")) {
            break;
        }
        List<TeamResult> teamResults = new ArrayList<>();
        for (Element match : calendarDay.select(".match")) {
            // Look the cells up once instead of four times per match.
            Elements cells = match.children();
            teamResults.add(new TeamResult(cells.get(0).text(), Double.parseDouble(cells.get(1).text().replace(",", "."))));
            teamResults.add(new TeamResult(cells.get(3).text(), Double.parseDouble(cells.get(2).text().replace(",", "."))));
        }
        results.put(day, teamResults);
        day++;
    }
    return results;
}
项目:PicKing    文件:AKabe.java   
/**
 * Parses a gallery detail page into one {@code PicInfo} per gallery item.
 *
 * <p>All pictures share the page title (first {@code #header h1}, or {@code ""})
 * and the same tag list instance (texts of {@code ul.tagList a}).
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as UTF-8
 * @param resultMap  map that receives the output and is returned
 * @return {@code resultMap} with {@code CURRENT_URL} and {@code RESULT} set
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "utf-8"));

    Elements heading = page.select("#header h1");
    String pageTitle = heading.isEmpty() ? "" : heading.get(0).text();

    List<String> tagNames = new ArrayList<>();
    for (Element tagLink : page.select("ul.tagList a")) {
        tagNames.add(tagLink.text());
    }

    List<PicInfo> pictures = new ArrayList<>();
    for (Element item : page.select("ul.gallery li:has(img)")) {
        PicInfo picture = new PicInfo(item.attr("data-src"));
        pictures.add(picture.setTitle(pageTitle).setTags(tagNames));
    }

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, pictures);
    return resultMap;
}
项目:ovh-java-sdk    文件:ApiOvhUtils.java   
/**
 * Creates a new OVH application through https://eu.api.ovh.com/createApp/.
 *
 * <p>The Application Key and Application Secret are extracted from the response
 * page with a regular expression and written to the log at WARN level; values
 * that are not found are logged as {@code null}.
 *
 * @param nic      account identifier sent as the {@code nic} form field
 * @param password account password sent as the {@code password} form field
 * @throws IOException if the POST request fails
 */
public void createApplication(String nic, String password) throws IOException {
    String responseHtml = Jsoup.connect("https://eu.api.ovh.com/createApp/")
            .data("nic", nic)
            .data("password", password)
            .data("applicationName", "One Shoot Token")
            .data("applicationDescription", "One Shoot Token")
            .post()
            .toString();

    // group(1) is the label ("Key" / "Secret"), group(2) the value inside <name>.
    Matcher matcher = Pattern
            .compile(" Application (\\w+)<pre><name>([^<]+)</name></pre>")
            .matcher(responseHtml);

    String appKey = null;
    String appSecret = null;
    while (matcher.find()) {
        String label = matcher.group(1);
        String value = matcher.group(2);
        if ("Key".equals(label)) {
            appKey = value;
        } else if ("Secret".equals(label)) {
            appSecret = value;
        }
    }
    log.warn("Key:{} Secret:{}", appKey, appSecret);
}
项目:crawler-jsoup-maven    文件:JsoupUtil.java   
/**
 * Fetches a document, optionally attaching a single request parameter.
 *
 * <p>When both key and value are empty the plain {@code getDocument(url)} is used;
 * when both are present they are wrapped in a one-entry map and passed to the
 * map-based {@code getDocumentWithData}. Supplying only one of the two is rejected.
 *
 * @param url           URL to fetch
 * @param dataKey       parameter name
 * @param dataValue     parameter value
 * @return the fetched document
 * @throws IllegalArgumentException if exactly one of key/value is empty
 * @throws Exception whatever the delegated fetch method throws
 */
public static Document getDocumentWithData(String url, String dataKey, String dataValue) throws Exception {

    final boolean keyMissing = StringUtil.isEmpty(dataKey);
    final boolean valueMissing = StringUtil.isEmpty(dataValue);

    if (keyMissing != valueMissing) {
        // Only half of the pair was supplied: invalid arguments.
        throw new IllegalArgumentException("key or value is err"); // TODO (bluetata, 2017/03/20): replace the hard-coded message
    }

    if (keyMissing) {
        return getDocument(url);
    }

    Map<String, String> dataMap = new HashMap<String, String>();
    dataMap.put(dataKey, dataValue);
    return getDocumentWithData(url, dataMap);
}
项目:ripme    文件:FlickrRipper.java   
/**
 * Loads {@code url} and, if its size list links to a different page, loads and
 * returns that page instead.
 *
 * <p>The size entries are scanned in order and the last one decides: an entry
 * without a link means the current page ({@code this.url}) is kept, otherwise
 * its href (made absolute when it starts with {@code /}) is the candidate.
 *
 * @param url page to load first
 * @return the document of the chosen page
 * @throws IOException if a page cannot be loaded
 */
private Document getLargestImagePageDocument(URL url) throws IOException {
    Document doc = Http.url(url).get();

    final String currentPage = this.url.toExternalForm();
    String largestImagePage = currentPage;

    for (Element sizeEntry : doc.select("ol.sizes-list > li > ol > li")) {
        Elements anchors = sizeEntry.select("a");
        if (anchors.isEmpty()) {
            // No link on this entry: the current page already shows this size.
            largestImagePage = currentPage;
            continue;
        }
        String href = anchors.get(0).attr("href");
        largestImagePage = href.startsWith("/") ? "http://www.flickr.com" + href : href;
    }

    if (!largestImagePage.equals(currentPage)) {
        // A different page holds the larger image; fetch it.
        doc = Http.url(largestImagePage).get();
    }
    return doc;
}
项目:desktop    文件:Main.java   
/**
 * Requests an insult in the language selected in {@code cmbLanguage} and shows
 * the text of the response body in {@code txtPaneShow}.
 *
 * <p>Runtime exceptions are rethrown unchanged; any other exception replaces
 * the pane content with an outage message.
 */
public void showWord() {
    try {
        Languages selected = (Languages) cmbLanguage.getSelectedItem();
        String endpoint = "http://evilinsult.com/generate_insult.php?lang=" + selected.getLang();
        for (Element body : Jsoup.connect(endpoint).get().select("body")) {
            txtPaneShow.setText("\n" + body.text());
        }
    } catch (RuntimeException e) {
        // Programming errors must not be masked by the outage message below.
        throw e;
    } catch (Exception ex) {
        txtPaneShow.setText("\n" + "Insult Outage! Please Check Your Internet Connection And Try Again In Three Minutes");
    }
}
项目:ripme    文件:FlickrRipper.java   
/**
 * Resolves the largest-size page for {@code this.url} and queues the image found
 * under {@code div#allsizes-photo} for download.
 *
 * <p>A missing image is logged as an error; I/O problems (including a malformed
 * image URL) are logged together with the page URL.
 */
@Override
public void run() {
    try {
        Document doc = getLargestImagePageDocument(this.url);
        Elements fullsizeImages = doc.select("div#allsizes-photo img");
        if (fullsizeImages.isEmpty()) {
            logger.error("Could not find flickr image at " + doc.location() + " - missing 'div#allsizes-photo img'");
            return;
        }

        // Optional numeric prefix keeps the downloads in album order.
        String prefix = Utils.getConfigBoolean("download.save_order", true)
                ? String.format("%03d_", index)
                : "";

        synchronized (flickrThreadPool) {
            addURLToDownload(new URL(fullsizeImages.first().attr("src")), prefix);
        }
    } catch (IOException e) {
        logger.error("[!] Exception while loading/parsing " + this.url, e);
    }
}
项目:ChineseStarsRelationship    文件:Spider.java   
/**
 * Appends the summary of {@code name} to {@code Summary.data} under {@code rootPath}.
 *
 * <p>The summary is the text of the first {@code .summary p} element. Any failure
 * (including a missing element) is logged and recorded in {@code ErrorSummary.data};
 * if that write fails too, the stack trace is printed.
 *
 * @param name person the summary belongs to
 * @param doc  parsed page containing the summary
 */
private static void downloadSummary(String name, Document doc) {

    // write to file
    try {
        String summary = doc.select(".summary p").first().text();
        logger.debug(summary);
        File summaryFile = new File(rootPath + "Summary.data");
        FileUtils.write(summaryFile, name + "\n" + summary + "\n", "utf-8", true);
    } catch (Exception e) {
        logger.error("个人描述信息 写入:【" + name + "】\t失败!");
        File errorFile = new File(rootPath + "ErrorSummary.data");
        try {
            FileUtils.write(errorFile, name + "\t" + e.toString() + "\n", "utf-8", true);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }
}
项目:InComb    文件:RssItem.java   
/**
 * Tries to parse an image URL out of the description.
 *
 * <p>The first {@code img} element with a non-blank {@code src} wins. A
 * host-relative {@code src} (starting with a single {@code /}) is turned into a
 * protocol-relative URL using the host of the content source; if that source URL
 * is malformed, the image is skipped and the next one is tried. Sources that are
 * already protocol-relative ({@code //host/...}) or absolute are returned as is.
 *
 * @return image URL found in the description, or {@code null} if there is no
 *         description or no usable image
 */
private String parseImage() {
    final String description = getDescription();
    if (description == null) {
        // Honour the documented contract instead of failing inside the parser.
        return null;
    }

    final Document doc = Jsoup.parse(description);
    for (final Element img : doc.getElementsByTag("img")) {
        String src = img.attr("src");
        if (!StringUtils.isNotBlank(src)) {
            continue;
        }

        // Only host-relative paths need the feed host; "//host/path" already names
        // its host and previously ended up with a second host prepended.
        if (src.startsWith("/") && !src.startsWith("//")) {
            try {
                final URL feedUrl = new URL(document.getContentSource().getUrl());
                src = "//" + feedUrl.getHost() + src;
            } catch (final MalformedURLException e) {
                // cannot resolve this one; try the next image
                continue;
            }
        }

        return src;
    }

    return null;
}
项目:ripme    文件:ImagebamRipper.java   
/**
 * Loads the page at {@code url} and queues the first {@code .image-container img}
 * for download.
 *
 * <p>A page without such an image only produces a warning; I/O problems
 * (including a malformed image URL) are logged as errors.
 */
private void fetchImage() {
    try {
        Elements images = Http.url(url).get().select(".image-container img");
        if (images.isEmpty()) {
            logger.warn("Image not found at " + this.url);
            return;
        }

        String imgsrc = images.first().attr("src");
        logger.info("Found URL " + imgsrc);

        // Only a prefix is supplied; the AbstractRipper "guesses" the filename.
        String prefix = Utils.getConfigBoolean("download.save_order", true)
                ? String.format("%03d_", index)
                : "";
        addURLToDownload(new URL(imgsrc), prefix);
    } catch (IOException e) {
        logger.error("[!] Exception while loading/parsing " + this.url, e);
    }
}
项目:nbaScorePull    文件:ExampleUnitTest.java   
/**
 * Downloads a fixed box-score page and checks the text of the second cell in the
 * second row of the {@code div_line_score} element.
 *
 * <p>NOTE(review): this test needs network access to basketball-reference.com.
 */
@Test
public void team_name_isValid() throws Exception {
    String url = "https://www.basketball-reference.com/boxscores/201706120GSW.html";
    Document doc = Jsoup.connect(url).get();
    Element lineScore = doc.getElementById("div_line_score");
    String awayTeamName = lineScore.getElementsByTag("tr").get(1).getElementsByTag("td").get(1).text();
    // assertEquals takes (expected, actual); the swapped order produced a
    // misleading "expected ... but was ..." message on failure.
    assertEquals("CLE", awayTeamName);
}
项目:nifi-nars    文件:GetWebpage.java   
/**
 * Converts HTML to XHTML with jsoup.
 *
 * <p>The input is parsed as a body fragment, the output syntax is switched to XML
 * and the serialized document is encoded with {@code charset}.
 *
 * @param html    markup to convert
 * @param charset charset used for the output settings and the byte encoding
 * @return the serialized document as bytes
 */
private byte[] formatToXHtml(String html, Charset charset) {
    Document fragment = Jsoup.parseBodyFragment(html);
    fragment.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .charset(charset);
    String xhtml = fragment.toString();
    return xhtml.getBytes(charset);
}
项目:ripme    文件:BcfakesRipper.java   
/**
 * Collects the gallery image URLs, dropping the {@code thumbs/thumbs_} part of
 * each thumbnail {@code src}.
 *
 * @param doc gallery page
 * @return rewritten URLs in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> fullSizeUrls = new ArrayList<>();
    for (Element img : doc.select("div.ngg-gallery-thumbnail > a > img")) {
        fullSizeUrls.add(img.attr("src").replace("thumbs/thumbs_", ""));
    }
    return fullSizeUrls;
}
项目:nixmash-blog    文件:JsoupServiceImpl.java   
/**
 * Fetches {@code url} with a GET request configured to be lenient: HTTP error
 * statuses and non-HTML content types are accepted, redirects are followed, and
 * the timeout is 12000 ms.
 *
 * @param url          address to fetch
 * @param validateCert whether TLS certificates are validated; must not be {@code null}
 *                     because it is unboxed
 * @return the parsed response
 * @throws IOException if the request fails
 */
private Document getDocument(String url, Boolean validateCert)
        throws IOException {
    return Jsoup.connect(url)
            // transport behaviour
            .validateTLSCertificates(validateCert)
            .timeout(12000)
            .followRedirects(true)
            // response leniency
            .ignoreContentType(true)
            .ignoreHttpErrors(true)
            // request headers
            .referrer("http://www.google.com")
            .userAgent(userAgent)
            .get();
}
项目:jsouplib    文件:AJsoupResponseBodyConverter.java   
/**
 * Converts a response body into {@code T}.
 *
 * <p>The body is parsed with jsoup. When the target type is {@code Document} the
 * parsed document itself is returned; otherwise it is handed to
 * {@code AJsoupReader.deserialize}. The body is closed once parsing succeeded,
 * whether or not the conversion fails.
 *
 * @param value response body to convert
 * @return the converted value
 * @throws IOException if the body cannot be read
 */
@Override
public T convert(ResponseBody value) throws IOException {
    Document document = Jsoup.parse(value.string());
    try {
        if (mT != Document.class) {
            return AJsoupReader.deserialize(document, (Class<T>) mT);
        }
        return (T) document;
    } finally {
        value.close();
    }
}
项目:vscrawler    文件:XpathNode.java   
/**
 * Returns the cached model, building it on first use.
 *
 * <p>The raw text is parsed as HTML relative to the base URL; if that fails for
 * any reason the model is built from the raw text via {@code SIPNode.t} instead.
 *
 * @return the (possibly newly created) model
 */
@Override
public SipNodes createOrGetModel() {
    if (model != null) {
        return model;
    }

    try {
        Document parsed = Jsoup.parse(getRawText(), getBaseUrl());
        if (parsed == null) {
            // Routed to the fallback in the catch block below.
            throw new RuntimeException();
        }
        model = new SipNodes(SIPNode.e(parsed));
    } catch (Exception e) {
        model = new SipNodes(SIPNode.t(getRawText()));
    }
    return model;
}
项目:ripme    文件:PorncomixRipper.java   
/**
 * Collects the gallery image URLs from the {@code data-lazy-src} attribute.
 *
 * @param doc gallery page
 * @return image URLs with the size suffix removed, in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> imageUrls = new ArrayList<>();
    for (Element img : doc.select("div.single-post > div.gallery > dl > dt > a > img")) {
        // Dropping the "-NNNxNNN" size suffix selects the full-size image
        // rather than the thumbnail.
        String fullSize = img.attr("data-lazy-src").replaceAll("-\\d\\d\\dx\\d\\d\\d", "");
        imageUrls.add(fullSize);
    }
    return imageUrls;
}
项目:MS-Cyber-Security-Solutions    文件:DataReceived.java   
/**
 * Counts how often {@code searchWord} occurs in the paragraphs of a web page.
 *
 * <p>The text of every {@code <p>} element is stripped of punctuation, split
 * into words, and each non-stop-word is inserted (lower-cased) into a trie and
 * tallied in a frequency map keyed by the word as written. The tally for
 * {@code searchWord} is returned.
 *
 * @param link       address of the page to download
 * @param searchWord word to look up; matched against the words as written
 * @return number of occurrences, or 0 if the page could not be processed
 */
public Integer getData(String link,String searchWord)
{
    int searchWordCount = 0;
    try
    {
        // Only filled here; the lookup that used it was debugging output.
        Trie myTrie = new Trie();

        // Use jsoup to read the web page.
        Document document = Jsoup.connect(link).get();

        // Only the paragraph tags of the page are considered.
        Elements paragraph = document.select("p");
        TreeMap<String, Integer> frequencyData = new TreeMap<String, Integer>();

        for (Element para : paragraph)
        {
            // Replace punctuation with a blank. The former replacement "\\s" is
            // read by replaceAll as an escaped literal 's', which glued the
            // letter s into the text instead of separating the words.
            String p = para.text().replaceAll("[,.!?:;()-]", " ");

            // Split on runs of whitespace so removed punctuation does not
            // produce empty "words".
            for (String word : p.split("\\s+")) {
                if (word.isEmpty()) continue;
                if (StopWord.is(word.toLowerCase())) continue;
                myTrie.insert(word.toLowerCase());
                frequencyData.put(word, PageRanking.getCount(word, frequencyData) + 1);
            }
        }
        // Assigned only after the whole page was processed, so a failure
        // half-way no longer leaks the tally of an unrelated word.
        searchWordCount = PageRanking.getCount(searchWord, frequencyData);
    }
    catch(Exception e)
    {
        e.printStackTrace();
    }
    return searchWordCount;
}
项目:crawling-framework    文件:ArticleExtractor.java   
/**
 * Parses an HTML page into an article and records which patterns matched.
 *
 * <p>Titles and texts are each joined with newlines (texts are normalized with
 * the source's text normalizers first). The publication date is the first
 * candidate that carries a parsed date; when one exists only its value and match
 * are reported, otherwise the values and matches of all candidates are reported.
 *
 * @param html          raw page markup
 * @param url           page address, also used as base URI for parsing
 * @param source        source configuration the article belongs to
 * @param publishedHint hint forwarded to the publication date extraction
 * @return the parse result containing the article and the match details
 */
public static HttpArticleParseResult extractArticleWithDetails(String html, String url, HttpSource source, String publishedHint) {
    Document document = Jsoup.parse(html, url);
    HttpArticleParseResult result = new HttpArticleParseResult();

    HttpArticle article = new HttpArticle();
    article.setUrl(url);
    article.setSource(source.getUrl());
    article.setAppIds(source.getAppIds());
    article.setCategories(source.getCategories());

    // JSON-LD metadata
    List<String> ldJsons = JsonLdParser.extractJsonLdParts(document);
    JsonLdParser.JsonLdArticle ldJsonArticle = JsonLdParser.parse(ldJsons);

    // title
    List<MatchedString> titles = extractTitlesWithJsoup(document, ldJsonArticle, source);
    String joinedTitle = titles.stream()
            .map(MatchedString::getValue)
            .collect(Collectors.joining("\n"));
    article.setTitle(joinedTitle);
    result.setTitleMatches(titles.stream().map(MatchedString::getMatch).collect(Collectors.toList()));

    // text
    List<MatchedString> texts = extractTextsWithJsoup(document, source);
    String joinedText = texts.stream()
            .map(MatchedString::getValue)
            .map(t -> TextFilters.normalizeText(t, source.getTextNormalizers()))
            .collect(Collectors.joining("\n"));
    article.setText(joinedText);
    result.setTextMatches(texts.stream().map(MatchedString::getMatch).distinct().collect(Collectors.toList()));

    // publication date: first candidate with a parsed date, if any
    List<MatchedDate> publicationDates = extractPublicationDates(html, document, ldJsonArticle, source, publishedHint);
    MatchedDate publicationDate = publicationDates.stream().filter(d -> d.getDate() != null).findFirst().orElse(null);
    article.setPublished(publicationDate != null ? publicationDate.getDate() : null);
    result.setPublishedPattern(publicationDate != null ? publicationDate.getPattern() : null);

    final List<String> publishedTexts;
    final List<String> publishedMatches;
    if (publicationDate != null) {
        publishedTexts = Lists.newArrayList(publicationDate.getValue());
        publishedMatches = Lists.newArrayList(publicationDate.getMatch());
    } else {
        // Nothing parsed: report every candidate.
        publishedTexts = publicationDates.stream().map(MatchedDate::getValue).collect(Collectors.toList());
        publishedMatches = publicationDates.stream().map(MatchedDate::getMatch).collect(Collectors.toList());
    }
    result.setPublishedTexts(publishedTexts);
    result.setPublishedMatches(publishedMatches);

    result.setArticle(article);
    return result;
}
项目:PicKing    文件:Aitaotu.java   
/**
 * Finds the URL of the next listing page.
 *
 * @param baseUrl    prefix prepended to the link's {@code href}
 * @param currentUrl not used by this implementation
 * @param result     raw page bytes, decoded as UTF-8
 * @return {@code baseUrl} plus the href of the first pager link whose own text
 *         contains "下一页", or {@code ""} when there is no such link
 * @throws UnsupportedEncodingException if the "utf-8" charset name is not supported
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Elements nextLinks = Jsoup.parse(new String(result, "utf-8"))
            .select("#pageNum a:containsOwn(下一页)");
    return nextLinks.isEmpty() ? "" : baseUrl + nextLinks.get(0).attr("href");
}
项目:AlipayOrdersSupervisor-GUI    文件:ApsvTimerTask.java   
/**
 * Extracts the orders listed in a trade-records page.
 *
 * <p>If the form with id {@code J-submit-form} is missing, the task is marked as
 * failed and an empty list is returned. Otherwise every {@code tr} of
 * {@code #tradeRecordsIndex>tbody} is turned into a signed {@code ApsvOrder}.
 *
 * @param html page markup
 * @return the orders found, possibly empty
 */
private ArrayList<ApsvOrder> findOrders(String html) {
    ArrayList<ApsvOrder> orders = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    if (doc.getElementById("J-submit-form") == null) {
        logger.error("Cannot find order list form, maybe cookie expires");
        // Mark the task status as abnormal.
        // TODO show a popup warning about the cookie problem
        RunTasksModel.getInstance().MarkTaskException(task.id);
        return orders;
    }

    for (Element row : doc.select("#tradeRecordsIndex>tbody").select("tr")) {
        Elements timeNodes = row.select("td.time p");
        String[] orderNoData = row.select("td.tradeNo p").text().split("\\|");
        // Anonymous subclass with an instance initializer, kept as in the original
        // so the runtime class of the created objects does not change.
        ApsvOrder order = new ApsvOrder(){
            {
                taskId = task.id;
                // first and last <p> of the time cell, joined with a space
                time = timeNodes.get(0).text() + " " + timeNodes.get(timeNodes.size() - 1).text();
                description = row.select(".memo-info").text();
                memo = row.select("td.memo p").text();
                // use the second "|"-separated part when present; keep what follows ":"
                tradeNo = orderNoData[orderNoData.length > 1 ? 1 : 0].split(":")[1];
                username = Unicode.unicodeToString(row.select("td.other p").text());
                amount = Float.parseFloat(row.select("td.amount span").text().replaceAll("\\s+", ""));
                status = row.select("td.status p").text();
            }
        };
        order.sig = Order.Sign(order, task.pushSecret);
        orders.add(order);
    }
    return orders;
}
项目:PartyBuildingStudies    文件:NewsPresenter.java   
/**
 * Downloads {@code url} on the I/O scheduler, extracts one {@code ArticleItem}
 * per {@code li} inside every {@code list_line} element, and delivers the list
 * to the view on the main thread.
 *
 * <p>The loading indicator is shown before the request and hidden again both
 * when the list arrives and when the download or parsing fails.
 *
 * @param url address of the page to parse
 */
@Override
public void onHandleParseHTML(final String url) {
    mView.showLoading(true);
    Observable.create(new ObservableOnSubscribe<ArrayList<ArticleItem>>() {
        @Override
        public void subscribe(ObservableEmitter<ArrayList<ArticleItem>> e) throws Exception {
            ArrayList<ArticleItem> list = new ArrayList<>();
            Document doc = Jsoup.connect(url).get();
            for (Element u : doc.getElementsByClass("list_line")) {
                for (Element l : u.getElementsByTag("li")) {
                    // Look the anchors up once; text and href come from the same set.
                    Elements anchors = l.getElementsByTag("a");
                    String text = anchors.text();
                    String href = anchors.attr("href");
                    String time = l.getElementsByTag("span").text();
                    list.add(new ArticleItem(text, href, time));
                }
            }
            e.onNext(list);
            // Single-shot source: signal completion after the only item.
            e.onComplete();
        }
    })
            .subscribeOn(Schedulers.io())
            .observeOn(AndroidSchedulers.mainThread())
            .subscribe(new Consumer<ArrayList<ArticleItem>>() {
                @Override
                public void accept(@NonNull ArrayList<ArticleItem> articleItems) throws Exception {
                    mView.showList(articleItems);
                    mView.showLoading(false);
                }
            }, new Consumer<Throwable>() {
                @Override
                public void accept(@NonNull Throwable error) throws Exception {
                    // Without an error consumer a failed download ended up as an
                    // unhandled error and the loading indicator stayed visible.
                    error.printStackTrace();
                    mView.showLoading(false);
                }
            });
}
项目:NTPaprEng    文件:PaperWebPage.java   
/**
 * Reads the volume number from the elements matched by {@code VOLUM_CSS_SELECTOR}.
 *
 * <p>The number is the part of the matched text starting at
 * {@code VOLUM_TEXT_OFFSET}.
 *
 * @param dom parsed page
 * @return the parsed volume, or 0 if the text is too short or not a number
 */
private int parseVolum(final Document dom) {
    final Elements matched = dom.select(VOLUM_CSS_SELECTOR);

    int parsed;
    try {
        String digits = matched.text().substring(VOLUM_TEXT_OFFSET);
        parsed = Integer.parseInt(digits);
    } catch (Exception e) {
        parsed = 0;
    }
    return parsed;
}
项目:ripme    文件:FuraffinityRipper.java   
/**
 * Resolves the image URL of every post linked from the page.
 *
 * <p>Each {@code figure.t-image > b > u > a} link is made absolute with
 * {@code urlBase} and passed to {@code getImageFromPost}.
 *
 * @param page gallery page
 * @return the values returned by {@code getImageFromPost}, in document order
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> imageUrls = new ArrayList<>();
    for (Element postLink : page.select("figure.t-image > b > u > a")) {
        String postUrl = urlBase + postLink.select("a").first().attr("href");
        imageUrls.add(getImageFromPost(postUrl));
    }
    return imageUrls;
}