$ mkdir /KazMuzik/uscis.gov $ cd /KazMuzik/uscis.gov $ tar zxvpf .../nutch-0.9.tar.gz $ cd nutch-0.9 $ vi conf/nutch-site.xml $ mkdir seeds $ vi seeds/seeds.txt $ cat seeds/seeds.txt http://www.uscis.gov/portal/site/uscis $ vi conf/crawl-urlfilter.txt $ cat conf/crawl-urlfilter.txt -^(file|ftp|mailto): -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$ # -[?*!@=] -.*(/.+?)/.*?\1/.*?\1/ +^http://([a-z0-9]*\.)*uscis.gov/ -. $ export NUTCH_JAVA_HOME=/usr/java/jdk $ nohup bin/nutch crawl seeds & $ tail -f nohup.out crawl started in: crawl-20080705105148 rootUrlDir = seeds threads = 10 depth = 5 Injector: starting ... merging indexes to: crawl-20080705105148/index Adding crawl-20080705105148/indexes/part-00000 done merging crawl finished: crawl-20080705105148 ^C $ bin/nutch readseg -list -dir crawl-20080705105148/segments | sort 20080705105152 1 2008-07-05T10:51:56 2008-07-05T10:51:56 1 1 20080705105202 33 2008-07-05T10:52:06 2008-07-05T10:52:42 39 24 20080705105254 248 2008-07-05T10:52:57 2008-07-05T10:58:28 287 184 20080705105910 1146 2008-07-05T10:59:22 2008-07-05T11:29:36 1191 903 20080705113222 2998 2008-07-05T11:32:29 2008-07-05T12:53:59 3046 2196 NAME GENERATED FETCHER START FETCHER END FETCHED PARSED $ |
/usr/local に、Tomcat 6.0.16 がインストールされていたので、deploy してみます。
$ cp nutch-0.9.war /usr/local/tomcat/webapps/uscis.war $ /usr/local/tomcat/bin/shutdown.sh $ vi /usr/local/tomcat/webapps/uscis/WEB-INF/classes/nutch-site.xml $ cat /usr/local/tomcat/webapps/uscis/WEB-INF/classes/nutch-site.xml <?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <name>searcher.dir</name> <value>/KazMuzik/uscis.gov/nutch-0.9/crawl-20080705105148</value> </property> </configuration> $ /usr/local/tomcat/bin/startup.sh $ |
Tags: computer_technology, immigration