JavaのHTMLパーサー色々
あるページの画像とリンクのURLをすべて取得するサンプルを書いてみた。
大体一緒だが、他のサンプルも後で書こう…
import java.io.IOException; import java.net.MalformedURLException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class SampleJsoup { public static void main(String[] args) { String url = "ãã¼ãã¼ããã"; try { Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements imgs = doc.select("[src]"); for(Element img : imgs){ if(img.tagName().equals("img")){ System.out.println(String.format("%s : <%s>",img.tagName(),img.attr("abs:src"))); } } for(Element link : links){ System.out.println(String.format("%s : <%s>",link.tagName(),link.attr("abs:href"))); } } catch (MalformedURLException e) { // TODO Auto-generated catch bklock e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
import java.io.InputStream;
import java.net.URL;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * CyberNeko HTML parser sample: parses one page into a W3C DOM and prints the
 * {@code href} of every {@code A} element followed by the {@code src} of every
 * {@code IMG} element, one {@code tag : <value>} line per element.
 */
public class SampleCyberNekoParser {

    /**
     * Opens the page, parses it and prints link and image attributes.
     *
     * @param args unused
     * @throws Exception if the URL cannot be opened or the page cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): this literal looks like a mis-encoded placeholder rather
        // than a real URL - replace it with the page to scan before running.
        URL pageUrl = new URL("ãã¼ãã¼ããã");
        DOMParser htmlParser = new DOMParser();

        InputStream pageStream = pageUrl.openConnection().getInputStream();
        try {
            htmlParser.parse(new InputSource(pageStream));
        } finally {
            // Always release the connection's stream, even when parsing fails.
            pageStream.close();
        }

        Document dom = htmlParser.getDocument();
        // NOTE(review): the upper-case names presumably match the parser's
        // element-name normalisation - confirm against the NekoHTML settings.
        NodeList anchors = dom.getElementsByTagName("A");
        NodeList images = dom.getElementsByTagName("IMG");

        printAttribute(anchors, "href");
        printAttribute(images, "src");
    }

    /** Prints {@code tag : <value>} for the given attribute of each element in the list. */
    private static void printAttribute(NodeList nodes, String attributeName) {
        for (int index = 0; index < nodes.getLength(); index++) {
            Element current = (Element) nodes.item(index);
            String line = String.format("%s : <%s>",
                    current.getTagName(), current.getAttribute(attributeName));
            System.out.println(line);
        }
    }
}
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

/**
 * Jericho HTML Parser sample: loads one page and prints the {@code href} of
 * every anchor element followed by the {@code src} of every image element,
 * one {@code name : <value>} line per element.
 */
public class SampleJerichoHTMLParser {

    /**
     * Loads the page and prints link and image attributes.
     *
     * @param args unused
     * @throws MalformedURLException if the URL string cannot be parsed
     * @throws IOException if the page cannot be read
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // NOTE(review): this literal looks like a mis-encoded placeholder rather
        // than a real URL - replace it with the page to scan before running.
        URL pageUrl = new URL("ãã¼ãã¼ããã");
        Source page = new Source(pageUrl);

        List<Element> anchors = page.getAllElements(HTMLElementName.A);
        List<Element> images = page.getAllElements(HTMLElementName.IMG);

        printAttribute(anchors, "href");
        printAttribute(images, "src");
    }

    /** Prints {@code name : <value>} for the given attribute of each element. */
    private static void printAttribute(List<Element> elements, String attributeName) {
        for (Element current : elements) {
            String line = String.format("%s : <%s>",
                    current.getName(), current.getAttributeValue(attributeName));
            System.out.println(line);
        }
    }
}
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * HtmlCleaner sample: cleans one page into a {@link TagNode} tree and prints
 * the {@code href} of every {@code a} node followed by the {@code src} of
 * every {@code img} node, one {@code name : <value>} line per node.
 */
public class SampleHTMLCleaner {

    /**
     * Cleans the page and prints link and image attributes.
     *
     * @param args unused
     * @throws MalformedURLException if the URL string cannot be parsed
     * @throws IOException if the page cannot be read
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        HtmlCleaner cleaner = new HtmlCleaner(new CleanerProperties());

        // NOTE(review): this literal looks like a mis-encoded placeholder rather
        // than a real URL - replace it with the page to scan before running.
        TagNode root = cleaner.clean(new URL("ãã¼ãã¼ããã"));

        // The second argument (true) is the recursive flag of getElementListByName.
        List<TagNode> anchors = root.getElementListByName("a", true);
        List<TagNode> images = root.getElementListByName("img", true);

        printAttribute(anchors, "href");
        printAttribute(images, "src");
    }

    /** Prints {@code name : <value>} for the given attribute of each node. */
    private static void printAttribute(List<TagNode> nodes, String attributeName) {
        for (TagNode current : nodes) {
            String line = String.format("%s : <%s>",
                    current.getName(), current.getAttributeByName(attributeName));
            System.out.println(line);
        }
    }
}