C#でスクレイピング
2009/12/23 追記 最下段に参考文献として、ここで紹介した以外の方法へのリンクを挙げます。
C#でHTMLを読み込んで、HTMLから有益な情報を取り出したいときってありますよね。
でも正規表現とかで抜き出すのってカッコ悪いかと思います。
そこで、「InfoPath SDKについてるHTMLtoXHTMLというCOMコンポーネント」を使ってXPath式で情報を取得します。
インストール方法は↓のURLを参照してください。
C#でスクレイピング - DENKEN
以下に実際に動くソースコードを貼っておきます。LINQとか使わない.Net Framework 2.0対応です。
using System.Xml;
using HTML2XHTMLLib;

namespace hogehoge
{
    /// <summary>
    /// Sample scraper: converts an HTML page to XHTML with the HTMLtoXHTML COM
    /// component (<c>XHTMLUtilities</c>) and then queries the result with XPath.
    /// </summary>
    /// <remarks>
    /// Written against .NET Framework 2.0 (no LINQ). The values extracted in the
    /// constructor are kept in locals purely to demonstrate the XPath queries.
    /// </remarks>
    public class Hoge
    {
        /// <summary>Constructor.</summary>
        /// <param name="contents">Page contents (HTML). Null or empty input is ignored.</param>
        public Hoge(string contents)
        {
            if (string.IsNullOrEmpty(contents))
            {
                return;
            }

            // Convert the HTML to XHTML.
            XHTMLUtilities util = new XHTMLUtilities();
            string xhtml = util.convertToXHTML(contents);

            // The converter output is not quite well-formed enough to load as XML,
            // so patch it up: re-insert the space that is missing between a closing
            // quote and a following "checked=" attribute.
            xhtml = xhtml.Replace("\"checked=", "\" checked=");

            // Load it as XML and grab the root node.
            XmlDocument xdoc = new XmlDocument();
            xdoc.LoadXml(xhtml);
            XmlElement root = xdoc.DocumentElement;

            // Register a prefix for the XHTML namespace. The prefix name itself is
            // arbitrary, but one must always be registered and used in every XPath
            // step below.
            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xdoc.NameTable);
            nsmgr.AddNamespace("ns", "http://www.w3.org/1999/xhtml");

            // Text inside the <title> tag. SelectSingleNode returns null when the
            // page has no <title>, so guard against that instead of dereferencing.
            XmlNode titleNode = root.SelectSingleNode("//ns:title", nsmgr);
            string title = titleNode != null ? titleNode.InnerText : "";

            // Every <a href="hoge..."> link whose target starts with "hoge".
            foreach (XmlNode node in root.SelectNodes("//ns:a[starts-with(@href, 'hoge')]", nsmgr))
            {
                string link = node.InnerText;
            }

            // Refer to other resources for how to write XPath expressions.
            // node2 is null when no <div class="hoge"> sits directly under <body>.
            XmlNode node2 = root.SelectSingleNode("./ns:body/ns:div[@class='hoge']", nsmgr);
        }
    }
}
使う側は、こんな感じです
// Example call site: pass the raw HTML of a page to the Hoge constructor.
// The Japanese text in the literal was mis-encoded and has been restored to UTF-8.
hogehoge.Hoge hoge = new hogehoge.Hoge("<html><head><title>タイトル</title></head><body><a href=\"hogehoge\">リンク先へ</a><div class=\"hoge\"></div></body></html>");
配布先で使ってもらうには、html2xhtml.dllを配るだけじゃダメなようで、html2xhtml.dllがあるディレクトリで次のコマンドを打ってもらう必要があるようです。
> regsvr32 html2xhtml.dll