22
33import java .io .FileWriter ;
44import java .io .IOException ;
5- import java .text .SimpleDateFormat ;
65import java .util .ArrayList ;
7- import java .util .Date ;
86import java .util .List ;
97
108import org .baeldung .reddit .util .UserAgentInterceptor ;
2018public class RedditDataCollector {
2119 public static final String TRAINING_FILE = "src/main/resources/train.csv" ;
2220 public static final String TEST_FILE = "src/main/resources/test.csv" ;
21+ public static final int LIMIT = 100 ;
22+ public static final Long YEAR = 31536000L ;
2323 private final Logger logger = LoggerFactory .getLogger (getClass ());
2424
25- private String postAfter ;
25+ private Long timestamp ;
2626 private final RestTemplate restTemplate ;
2727 private final String subreddit ;
28- private final int minScore ;
2928
/**
 * Creates a collector for the "java" subreddit.
 *
 * The REST template is configured with a single {@link UserAgentInterceptor},
 * which is applied to every request issued through it.
 */
public RedditDataCollector() {
    restTemplate = new RestTemplate();
    final List<ClientHttpRequestInterceptor> interceptors = new ArrayList<ClientHttpRequestInterceptor>();
    interceptors.add(new UserAgentInterceptor());
    restTemplate.setInterceptors(interceptors);
    // The final field may be assigned only once; the earlier "all" default
    // and the minScore threshold no longer exist.
    subreddit = "java";
}
3836
3937 public RedditDataCollector (String subreddit , int minScore ) {
@@ -42,35 +40,30 @@ public RedditDataCollector(String subreddit, int minScore) {
4240 list .add (new UserAgentInterceptor ());
4341 restTemplate .setInterceptors (list );
4442 this .subreddit = subreddit ;
45- this .minScore = minScore ;
4643 }
4744
4845 public void collectData () {
49- final int limit = 100 ;
5046 final int noOfRounds = 80 ;
47+ timestamp = System .currentTimeMillis () / 1000 ;
5148 try {
5249 final FileWriter writer = new FileWriter (TRAINING_FILE );
5350 for (int i = 0 ; i < noOfRounds ; i ++) {
54- getPosts (limit , writer );
51+ getPosts (writer );
5552 }
5653 writer .close ();
5754
5855 final FileWriter testWriter = new FileWriter (TEST_FILE );
59- getPosts (limit , testWriter );
56+ getPosts (testWriter );
6057 testWriter .close ();
6158 } catch (final Exception e ) {
6259 logger .error ("write to file error" , e );
6360 }
6461 }
6562
66- // ==== private
67-
68- private void getPosts (int limit , FileWriter writer ) {
69- String fullUrl = "http://www.reddit.com/r/" + subreddit + "/new.json?limit=" + limit ;
70- if (postAfter != null ) {
71- fullUrl += "&count=" + limit + "&after=" + postAfter ;
72- }
63+ // ==== Private
7364
65+ private void getPosts (FileWriter writer ) {
66+ final String fullUrl = "http://www.reddit.com/r/" + subreddit + "/search.json?sort=new&q=timestamp:" + (timestamp - YEAR ) + ".." + timestamp + "&restrict_sr=on&syntax=cloudsearch&limit=" + LIMIT ;
7467 try {
7568 final JsonNode node = restTemplate .getForObject (fullUrl , JsonNode .class );
7669 parseNode (node , writer );
@@ -82,22 +75,19 @@ private void getPosts(int limit, FileWriter writer) {
8275 }
8376
8477 private void parseNode (JsonNode node , FileWriter writer ) throws IOException {
85- postAfter = node .get ("data" ).get ("after" ).asText ();
86- System .out .println (postAfter );
8778 String line ;
88- String category ;
8979 List <String > words ;
90- final SimpleDateFormat df = new SimpleDateFormat ( "HH" ) ;
80+ int score ;
9181 for (final JsonNode child : node .get ("data" ).get ("children" )) {
92- category = ( child .get ("data" ).get ("score" ).asInt () < minScore ) ? "bad" : "good" ;
82+ score = child .get ("data" ).get ("score" ).asInt ();
9383 words = Splitter .onPattern ("\\ W" ).omitEmptyStrings ().splitToList (child .get ("data" ).get ("title" ).asText ());
94- final Date date = new Date ( child .get ("data" ).get ("created_utc" ).asLong () * 1000 );
84+ timestamp = child .get ("data" ).get ("created_utc" ).asLong ();
9585
96- line = category + ";" ;
97- line += df . format ( date ) + ";" ;
86+ line = score + ";" ;
87+ line += timestamp + ";" ;
9888 line += words .size () + ";" + Joiner .on (' ' ).join (words ) + ";" ;
9989 line += child .get ("data" ).get ("domain" ).asText () + "\n " ;
100-
90+ System . out . println ( line );
10191 writer .write (line );
10292 }
10393 }
0 commit comments