solver-benchmarks/HackageBenchmark.hs

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ScopedTypeVariables #-}

module HackageBenchmark (
    hackageBenchmarkMain

-- Exposed for testing:
  , CabalResult(..)
  , isSignificantTimeDifference
  , combineTrialResults
  , isSignificantResult
  , shouldContinueAfterFirstTrial
  ) where

import Control.Concurrent.Async (concurrently)
import Control.Monad (forM, replicateM, unless, when)
import qualified Data.ByteString as BS
import Data.List (nub, unzip4)
import Data.Maybe (isJust, catMaybes)
import Data.String (fromString)
import Data.Function ((&))
import Data.Time (NominalDiffTime, diffUTCTime, getCurrentTime)
import qualified Data.Vector.Unboxed as V
import Options.Applicative
import Statistics.Sample (mean, stdDev, geometricMean)
import Statistics.Test.MannWhitneyU ( PositionTest(..), TestResult(..)
                                    , mannWhitneyUCriticalValue
                                    , mannWhitneyUtest)
import Statistics.Types (PValue, mkPValue)
import System.Directory (getTemporaryDirectory, createDirectoryIfMissing)
import System.Environment (getEnvironment)
import System.Exit (ExitCode(..), exitWith, exitFailure)
import System.FilePath ((</>))
import System.IO ( BufferMode(LineBuffering), hPutStrLn, hSetBuffering, stderr
                 , stdout)
import System.Process ( StdStream(CreatePipe), CreateProcess(..), callProcess
                      , createProcess, readProcess, shell, waitForProcess, proc, readCreateProcessWithExitCode )
import Text.Printf (printf)

import qualified Data.Map.Strict as Map

import Distribution.Package (PackageName, mkPackageName, unPackageName)

data Args = Args {
    argCabal1                      :: FilePath
  , argCabal2                      :: FilePath
  , argCabal1Flags                 :: [String]
  , argCabal2Flags                 :: [String]
  , argPackages                    :: [PackageName]
  , argMinRunTimeDifferenceToRerun :: Double
  , argPValue                      :: PValue Double
  , argTrials                      :: Int
  , argConcurrently                :: Bool
  , argPrintTrials                 :: Bool
  , argPrintSkippedPackages        :: Bool
  , argTimeoutSeconds              :: Int
  }

data CabalTrial = CabalTrial NominalDiffTime CabalResult

data CabalResult
  = Solution
  | NoInstallPlan
  | BackjumpLimit
  | Unbuildable
  | UnbuildableDep
  | ComponentCycle
  | ModReexpIssue
  | PkgNotFound
  | Timeout
  | Unknown
  deriving (Eq, Show)

hackageBenchmarkMain :: IO ()
hackageBenchmarkMain = do
  hSetBuffering stdout LineBuffering
  args@Args {..} <- execParser parserInfo
  checkArgs args
  printConfig args
  pkgs <- getPackages args
  putStrLn ""

  let concurrently' :: IO a -> IO b -> IO (a, b)
      concurrently' | argConcurrently = concurrently
                    | otherwise       = \ma mb -> do { a <- ma; b <- mb; return (a, b) }

  let -- The maximum length of the heading and package names.
      nameColumnWidth :: Int
      nameColumnWidth =
          maximum $ map length $ "package" : map unPackageName pkgs

  -- create cabal runners
  runCabal1 <- runCabal argTimeoutSeconds CabalUnderTest1 argCabal1 argCabal1Flags
  runCabal2 <- runCabal argTimeoutSeconds CabalUnderTest2 argCabal2 argCabal2Flags

  -- When the output contains both trails and summaries, label each row as
  -- "trial" or "summary".
  when argPrintTrials $ putStr $ printf "%-16s " "trial/summary"
  putStrLn $
      printf "%-*s %-14s %-14s %11s %11s %11s %11s %11s"
             nameColumnWidth "package" "result1" "result2"
             "mean1" "mean2" "stddev1" "stddev2" "speedup"

  speedups :: [Double] <- fmap catMaybes $ forM pkgs $ \pkg -> do
    let printTrial msgType result1 result2 time1 time2 =
            putStrLn $
            printf "%-16s %-*s %-14s %-14s %10.3fs %10.3fs"
                   msgType nameColumnWidth (unPackageName pkg)
                   (show result1) (show result2)
                   (diffTimeToDouble time1) (diffTimeToDouble time2)

    (CabalTrial t1 r1, CabalTrial t2 r2) <- runCabal1 pkg `concurrently'` runCabal2 pkg

    if not $
       shouldContinueAfterFirstTrial argMinRunTimeDifferenceToRerun t1 t2 r1 r2
    then do
      when argPrintSkippedPackages $
         if argPrintTrials
         then printTrial "trial (skipping)" r1 r2 t1 t2
         else putStrLn $ printf "%-*s (first run times were too similar)"
                                nameColumnWidth (unPackageName pkg)
      return Nothing
    else do
      when argPrintTrials $ printTrial "trial" r1 r2 t1 t2
      (ts1, ts2, rs1, rs2) <- (unzip4 . ((t1, t2, r1, r2) :) <$>)
                            . replicateM (argTrials - 1) $ do

        (CabalTrial t1' r1', CabalTrial t2' r2') <- runCabal1 pkg `concurrently'` runCabal2 pkg
        when argPrintTrials $ printTrial "trial" r1' r2' t1' t2'
        return (t1', t2', r1', r2')

      let result1 = combineTrialResults rs1
          result2 = combineTrialResults rs2
          times1 = V.fromList (map diffTimeToDouble ts1)
          times2 = V.fromList (map diffTimeToDouble ts2)
          mean1 = mean times1
          mean2 = mean times2
          stddev1 = stdDev times1
          stddev2 = stdDev times2
          speedup = mean1 / mean2

      when argPrintTrials $ putStr $ printf "%-16s " "summary"
      if isSignificantResult result1 result2
          || isSignificantTimeDifference argPValue ts1 ts2
      then putStrLn $
           printf "%-*s %-14s %-14s %10.3fs %10.3fs %10.3fs %10.3fs %10.3f"
                  nameColumnWidth (unPackageName pkg)
                  (show result1) (show result2) mean1 mean2 stddev1 stddev2 speedup
      else when (argPrintTrials || argPrintSkippedPackages) $
           putStrLn $
           printf "%-*s (not significant, speedup = %10.3f)" nameColumnWidth (unPackageName pkg) speedup

      -- return speedup value
      return (Just speedup)

  -- finally, calculate the geometric mean of speedups
  printf "Geometric mean of %d packages' speedups is %10.3f\n" (length speedups) (geometricMean (V.fromList speedups))

  where
    checkArgs :: Args -> IO ()
    checkArgs Args {..} = do
      let die msg = hPutStrLn stderr msg >> exitFailure
      unless (argTrials > 0) $ die "--trials must be greater than 0."
      unless (argMinRunTimeDifferenceToRerun >= 0) $
          die "--min-run-time-percentage-difference-to-rerun must be non-negative."
      unless (isSampleLargeEnough argPValue argTrials) $
          die "p-value is too small for the number of trials."

    printConfig :: Args -> IO ()
    printConfig Args {..} = do
      putStrLn "Comparing:"
      putStrLn $ "1: " ++ argCabal1 ++ " " ++ unwords argCabal1Flags
      callProcess argCabal1 ["--version"]
      putStrLn $ "2: " ++ argCabal2 ++ " " ++ unwords argCabal2Flags
      callProcess argCabal2 ["--version"]
      -- TODO: Print index state.
      putStrLn "Base package database:"
      callProcess "ghc-pkg" ["list"]

    getPackages :: Args -> IO [PackageName]
    getPackages Args {..} = do
      pkgs <-
          if null argPackages
          then do
            putStrLn $ "Obtaining the package list (using " ++ argCabal1 ++ ") ..."
            list <- readProcess argCabal1 ["list", "--simple-output"] ""
            return $ nub [mkPackageName n | n : _ <- words <$> lines list]
          else do
            putStrLn "Using given package list ..."
            return argPackages
      putStrLn $ "Done, got " ++ show (length pkgs) ++ " packages."
      return pkgs

data CabalUnderTest = CabalUnderTest1 | CabalUnderTest2

runCabal
    :: Int             -- ^ timeout in seconds
    -> CabalUnderTest  -- ^ cabal under test
    -> FilePath        -- ^ cabal
    -> [String]        -- ^ flags
    -> IO (PackageName  -> IO CabalTrial)  -- ^ testing function.
runCabal timeoutSeconds cabalUnderTest cabal flags = do
  tmpDir <- getTemporaryDirectory

  -- cabal directory for this cabal under test
  let cabalDir = tmpDir </> "solver-benchmarks-workdir" </> case cabalUnderTest of
          CabalUnderTest1 -> "cabal1"
          CabalUnderTest2 -> "cabal2"

  putStrLn $ "Cabal directory (for " ++ cabal ++ ") " ++ cabalDir
  createDirectoryIfMissing True cabalDir

  -- shell environment
  currEnv <- Map.fromList <$>  getEnvironment
  let thisEnv :: [(String, String)]
      thisEnv = Map.toList $ currEnv
          & Map.insert "CABAL_CONFIG" (cabalDir </> "config")
          & Map.insert "CABAL_DIR"     cabalDir

  -- Initialize the config file, whether or not it already exists
  runCabalCmdWithEnv cabalDir thisEnv ["user-config", "init", "--force"]

  -- Run cabal update
  putStrLn $ "Running cabal update (using " ++ cabal ++ ") ..."
  runCabalCmdWithEnv cabalDir thisEnv ["update"]

  -- return an actual runner
  return $ \pkg -> do
    ((exitCode, err), time) <- timeEvent $ do

      let timeout = "timeout --foreground -sINT " ++ show timeoutSeconds
          cabalCmd = unwords $
              [ cabal

              , "install"

                -- These flags prevent a Cabal project or package environment from
                -- affecting the install plan.
                --
                -- Note: we are somewhere in /tmp, hopefully there is no cabal.project on upper level
              , "--package-env=non-existent-package-env"

                -- --lib allows solving for packages with libraries or
                -- executables.
              , "--lib"

              , unPackageName pkg

              , "--dry-run"

                -- The test doesn't currently handle stdout, so we suppress it
                -- with silent. nowrap simplifies parsing the errors messages.
              , "-vsilent+nowrap"

              ]

               ++ flags

          cmd = (shell (timeout ++ " " ++ cabalCmd))
              { std_err = CreatePipe
              , env = Just thisEnv
              , cwd = Just cabalDir
              }

      -- TODO: Read stdout and compare the install plans.
      (_, _, Just errh, ph) <- createProcess cmd
      err <- BS.hGetContents errh
      (, err) <$> waitForProcess ph

    let exhaustiveMsg =
            "After searching the rest of the dependency tree exhaustively"
        result
          | exitCode == ExitSuccess                                                          = Solution
          | exitCode == ExitFailure 124                                                      = Timeout
          | fromString exhaustiveMsg `BS.isInfixOf` err                                       = NoInstallPlan
          | fromString "Backjump limit reached" `BS.isInfixOf` err                            = BackjumpLimit
          | fromString "none of the components are available to build" `BS.isInfixOf` err     = Unbuildable
          | fromString "Dependency on unbuildable" `BS.isInfixOf` err                         = UnbuildableDep
          | fromString "Dependency cycle between the following components" `BS.isInfixOf` err = ComponentCycle
          | fromString "Problem with module re-exports" `BS.isInfixOf` err                    = ModReexpIssue
          | fromString "There is no package named" `BS.isInfixOf` err                         = PkgNotFound
          | otherwise                                                                        = Unknown
    return (CabalTrial time result)
  where
    runCabalCmdWithEnv cabalDir thisEnv args = do
      (ec, uout, uerr) <- readCreateProcessWithExitCode (proc cabal args)
          { cwd = Just cabalDir
          , env = Just thisEnv
          }
          ""
      unless (ec == ExitSuccess) $ do
          putStrLn uout
          putStrLn uerr
          exitWith ec

isSampleLargeEnough :: PValue Double -> Int -> Bool
isSampleLargeEnough pvalue trials =
    -- mannWhitneyUCriticalValue, which can fail with too few samples, is only
    -- used when both sample sizes are less than or equal to 20.
    trials > 20 || isJust (mannWhitneyUCriticalValue (trials, trials) pvalue)

isSignificantTimeDifference :: PValue Double -> [NominalDiffTime] -> [NominalDiffTime] -> Bool
isSignificantTimeDifference pvalue xs ys =
  let toVector = V.fromList . map diffTimeToDouble
  in case mannWhitneyUtest SamplesDiffer pvalue (toVector xs) (toVector ys) of
       Nothing             -> error "not enough data for mannWhitneyUtest"
       Just Significant    -> True
       Just NotSignificant -> False

-- Should we stop after the first trial of this package to save time? This
-- function skips the package if the results are uninteresting and the times are
-- within --min-run-time-percentage-difference-to-rerun.
shouldContinueAfterFirstTrial :: Double
                              -> NominalDiffTime
                              -> NominalDiffTime
                              -> CabalResult
                              -> CabalResult
                              -> Bool
shouldContinueAfterFirstTrial 0                            _  _  _       _       = True
shouldContinueAfterFirstTrial _                            _  _  Timeout Timeout = False
shouldContinueAfterFirstTrial maxRunTimeDifferenceToIgnore t1 t2 r1      r2      =
    isSignificantResult r1 r2
 || abs (t1 - t2) / min t1 t2 >= realToFrac (maxRunTimeDifferenceToIgnore / 100)

isSignificantResult :: CabalResult -> CabalResult -> Bool
isSignificantResult r1 r2 = r1 /= r2 || not (isExpectedResult r1)

-- Is this result expected in a benchmark run on all of Hackage?
isExpectedResult :: CabalResult -> Bool
isExpectedResult Solution       = True
isExpectedResult NoInstallPlan  = True
isExpectedResult BackjumpLimit  = True
isExpectedResult Timeout        = True
isExpectedResult Unbuildable    = True
isExpectedResult UnbuildableDep = True
isExpectedResult ComponentCycle = True
isExpectedResult ModReexpIssue  = True
isExpectedResult PkgNotFound    = False
isExpectedResult Unknown        = False

-- Combine CabalResults from multiple trials. Ignoring timeouts, all results
-- should be the same. If they aren't the same, we returns Unknown.
combineTrialResults :: [CabalResult] -> CabalResult
combineTrialResults rs
  | r:_ <- rs
  , allEqual rs                          = r
  | allEqual [r | r <- rs, r /= Timeout] = Timeout
  | otherwise                            = Unknown
  where
    allEqual :: Eq a => [a] -> Bool
    allEqual xs = length (nub xs) == 1

timeEvent :: IO a -> IO (a, NominalDiffTime)
timeEvent task = do
  start <- getCurrentTime
  r <- task
  end <- getCurrentTime
  return (r, diffUTCTime end start)

diffTimeToDouble :: NominalDiffTime -> Double
diffTimeToDouble = fromRational . toRational

parserInfo :: ParserInfo Args
parserInfo = info (argParser <**> helper)
     ( fullDesc
    <> progDesc ("Find differences between two cabal commands when solving"
                   ++ " for all packages on Hackage.")
    <> header "hackage-benchmark" )

argParser :: Parser Args
argParser = Args
    <$> strOption
         ( long "cabal1"
        <> metavar "PATH"
        <> help "First cabal executable")
    <*> strOption
         ( long "cabal2"
        <> metavar "PATH"
        <> help "Second cabal executable")
    <*> option (words <$> str)
         ( long "cabal1-flags"
        <> value []
        <> metavar "FLAGS"
        <> help "Extra flags for the first cabal executable")
    <*> option (words <$> str)
         ( long "cabal2-flags"
        <> value []
        <> metavar "FLAGS"
        <> help "Extra flags for the second cabal executable")
    <*> option (map mkPackageName . words <$> str)
         ( long "packages"
        <> value []
        <> metavar "PACKAGES"
        <> help ("Space separated list of packages to test, or all of Hackage"
                   ++ " if unspecified"))
    <*> option auto
         ( long "min-run-time-percentage-difference-to-rerun"
        <> showDefault
        <> value 0.0
        <> metavar "PERCENTAGE"
        <> help ("Stop testing a package when the difference in run times in"
                   ++ " the first trial are within this percentage, in order to"
                   ++ " save time"))
    <*> option (mkPValue <$> auto)
         ( long "pvalue"
        <> showDefault
        <> value (mkPValue 0.05)
        <> metavar "DOUBLE"
        <> help ("p-value used to determine whether to print the results for"
                   ++ " each package"))
    <*> option auto
         ( long "trials"
        <> showDefault
        <> value 10
        <> metavar "N"
        <> help "Number of trials for each package")
    <*> switch
         ( long "concurrently"
        <> help "Run cabals concurrently")
    <*> switch
         ( long "print-trials"
        <> help "Whether to include the results from individual trials in the output")
    <*> switch
         ( long "print-skipped-packages"
        <> help "Whether to include skipped packages in the output")
    <*> option auto
         ( long "timeout"
        <> showDefault
        <> value 90
        <> metavar "SECONDS"
        <> help "Maximum time to run a cabal command, in seconds")