-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathredditcrawler.hs
110 lines (94 loc) · 3.46 KB
/
redditcrawler.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Control.Lens ((^.))
import Control.Monad (mzero)
import Data.Aeson (FromJSON, Object, parseJSON, (.:))
import qualified Data.Aeson as Aeson
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv
import Data.Text (Text)
import GHC.Generics (Generic)
import qualified Network.Wreq as W
import qualified Network.Wreq.Session as WS
import Options.Applicative (Parser, ParserInfo, argument, execParser,
fullDesc, header, help, helper, info,
metavar, progDesc, str, (<*>), (<>))
-- | reddit crawler entry point: fetch the listing returned by
-- 'getRedditList' and save its first ten entries as CSV in the file named
-- on the command line.
main :: IO ()
main = do
  -- Parse the command line before doing any network work.
  opts <- execParser optsParserInfo
  -- Fetch and decode the listing inside a wreq session.
  response <- WS.withSession getRedditList
  let listing = response ^. W.responseBody
      -- Keep at most the first ten children and unwrap each item's payload.
      firstTen =
        [rlidatas item | item <- take 10 (Main.children (datas listing))]
  -- Encode the rows using the record's default column order and write them
  -- to the requested output file.
  BSL.writeFile (outputFilename opts) (Csv.encodeDefaultOrderedByName firstTen)
-- Structures matching the json response from reddit

-- | Outermost object of the listing response.
data RedditListing = RedditListing
  { kind  :: Text
    -- ^ Read from the JSON @"kind"@ key.
  , datas :: RedditListingData
    -- ^ Read from the JSON @"data"@ key.
  } deriving Show
-- | Instance for Aeson to decode JSON into a 'RedditListing'.
--
-- Reads the @"kind"@ and @"data"@ keys of a JSON object. Any value that is
-- not an object is rejected by 'Aeson.withObject', so the parse failure
-- names the type that was expected instead of failing without a message.
instance FromJSON RedditListing where
  parseJSON = Aeson.withObject "RedditListing" $ \obj ->
    RedditListing
      <$> obj .: "kind"
      <*> obj .: "data"
-- | Payload stored in the 'datas' field of a 'RedditListing'.
--
-- The 'FromJSON' instance for this type has an empty body and the type
-- derives 'Generic', so the field names below are expected to match the
-- JSON keys; renaming a field changes what is decoded.
--
-- NOTE(review): 'after' is a plain 'Text' while 'before' is optional.
-- Confirm reddit never sends a null @after@ (e.g. on a last page), since
-- that would presumably fail the whole decode.
data RedditListingData = RedditListingData
  { modhash  :: Text
  , children :: [RedditListingItem]
    -- ^ Entries of the listing, in response order.
  , after    :: Text
  , before   :: Maybe Text
  } deriving (Generic, Show)
-- | Decoded through the type's 'Generic' representation with aeson's
-- default options, which is what an empty instance body falls back to.
instance FromJSON RedditListingData where
  parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
-- | One entry of 'children': a kind tag plus the entry's payload.
data RedditListingItem = RedditListingItem
  { rlikind  :: Text
    -- ^ Read from the JSON @"kind"@ key.
  , rlidatas :: RedditListingItemData
    -- ^ Read from the JSON @"data"@ key.
  } deriving Show
-- | Decode a 'RedditListingItem' from a JSON object, reading the @"kind"@
-- and @"data"@ keys.
--
-- Any value that is not an object is rejected by 'Aeson.withObject', so the
-- parse failure names the type that was expected instead of failing without
-- a message.
instance FromJSON RedditListingItem where
  parseJSON = Aeson.withObject "RedditListingItem" $ \obj ->
    RedditListingItem
      <$> obj .: "kind"
      <*> obj .: "data"
-- | The fields of a listing entry that end up in the CSV output.
--
-- This type derives 'Generic' and its JSON and CSV instances have empty
-- bodies, so field names and field order are significant: keep both stable
-- unless the decoded keys or the CSV columns are meant to change.
data RedditListingItemData = RedditListingItemData
  { title     :: Text
  , subreddit :: Text
  , url       :: Text
  , permalink :: Text
  } deriving (Generic, Show)
-- | Decoded through the type's 'Generic' representation with aeson's
-- default options, which is what an empty instance body falls back to.
instance FromJSON RedditListingItemData where
  parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
-- Instances for turning the RedditListingItemData data type into csv rows.
-- Both instance bodies are empty, so they rely on the class defaults
-- (presumably the Generic-based ones, since the type derives Generic).
-- ToNamedRecord figures out header names from record element names
instance Csv.ToNamedRecord RedditListingItemData
-- DefaultOrdered uses the order of elements in the record for the csv
-- column ordering
instance Csv.DefaultOrdered RedditListingItemData
-- | Request @https://reddit.com/hot.json@ through the given session and
-- decode the response body as a 'RedditListing'.
--
-- Failures raised by 'WS.get' or 'W.asJSON' are not handled here; they
-- propagate to the caller.
getRedditList :: WS.Session -> IO (W.Response RedditListing)
getRedditList session =
  WS.get session "https://reddit.com/hot.json" >>= W.asJSON
-- | Structure holding the parsed cli arguments.
data Options = Options
  { outputFilename :: FilePath -- ^ File the CSV output is written to.
  }
-- | Parser for the cli arguments: a single positional @FILENAME@ argument
-- that becomes the 'outputFilename' of the resulting 'Options'.
optsParser :: Parser Options
optsParser =
  Options
    <$> argument str (mconcat [metavar "FILENAME", help "File to output to"])
-- | 'optsParser' combined with 'helper' and the program's descriptive text
-- (description and header line).
optsParserInfo :: ParserInfo Options
optsParserInfo = info parserWithHelp description
  where
    parserWithHelp = helper <*> optsParser
    description =
      fullDesc
        <> progDesc "The worst reddit client"
        <> header "redditcrawler - a bad reddit client"