Skip to content

Commit

Permalink
Allow concurrent indexing in glean index cpp-cmake (#271)
Browse files Browse the repository at this point in the history
Summary:
This adds support for concurrent indexing based on the existing worklist framework. There are some shenanigans
related to progress reporting which mostly exist to make indexing LLVM nicer (script coming up).

The other major addition is the index-llvm.sh script which automates indexing LLVM source distributions using the parallel indexer - try the example suggested in index-llvm.sh --help. I'm not sure where the script should live. On my server indexing takes about 10 min with -j16.

Note that this includes #269 - we don't have stacked PRs so that one should probably be landed first.

Pull Request resolved: #271

Reviewed By: simonmar

Differential Revision: D39420798

Pulled By: pepeiborra

fbshipit-source-id: 2dc81e20a20fb522e42557944cd4871f72099775
  • Loading branch information
rleshchinskiy authored and facebook-github-bot committed Sep 25, 2022
1 parent 718ab66 commit 0930b0f
Show file tree
Hide file tree
Showing 7 changed files with 419 additions and 65 deletions.
11 changes: 11 additions & 0 deletions glean.cabal.in
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,16 @@ library client-cpp
build-depends:
glean:rts

library interprocess
import: fb-haskell, deps
visibility: public
hs-source-dirs: glean/interprocess/hs
exposed-modules:
Glean.Interprocess.Counters
Glean.Interprocess.Worklist
build-depends:
glean:client-cpp

library schema
import: fb-haskell, fb-cpp, deps
visibility: public
Expand Down Expand Up @@ -995,6 +1005,7 @@ library indexers
glean:client-hs-local,
glean:db,
glean:handler,
glean:interprocess,
glean:lib,
glean:lsif,
glean:stubs,
Expand Down
232 changes: 170 additions & 62 deletions glean/lang/clang/Glean/Indexer/Cpp.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ module Glean.Indexer.Cpp
( indexerWith, indexer, indexerNoDeriv, Clang(..)
, findExecutableRecursive ) where

import Control.Concurrent (threadDelay)
import Control.Concurrent.Async
import Control.Exception
import Control.Monad
import Data.Proxy
import Options.Applicative
import qualified System.Console.ANSI as ANSI
import System.Directory
import System.Environment
import System.Exit
Expand All @@ -23,6 +26,7 @@ import System.IO
import System.Process
import Thrift.Protocol (deserializeGen)
import Thrift.Protocol.Compact (Compact)
import Util.List (chunk)

import Facebook.Fb303
import Facebook.Service
Expand All @@ -32,6 +36,7 @@ import Glean.Indexer
import Glean.LocalOrRemote ( BackendKind(..),
LocalOrRemote(..), serializeInventory )
import Glean.Util.Service
import qualified Glean.Interprocess.Worklist as Worklist

import qualified Data.ByteString as BS
import qualified Glean.Handler as GleanHandler
Expand All @@ -42,7 +47,9 @@ data Clang = Clang
, clangDeriveBin :: Maybe FilePath -- ^ path to @clang-derive@ binary
, clangCompileDBDir :: Maybe FilePath
-- ^ (optional) path to pre-existing @compile_commands.json@
, clangJobs :: Int -- ^ number of indexers to run concurrently
, clangVerbose :: Bool -- ^ display debugging information
, clangProgress :: Bool -- ^ display indexing progress
} deriving Show

options :: Parser Clang
Expand All @@ -56,10 +63,18 @@ options = do
clangCompileDBDir <- optional $ strOption $
long "cdb" <>
help "path to a directory containing an existing compile_commands.json file"
clangJobs <- option auto $
short 'j' <>
long "jobs" <>
value 1 <>
help "run N indexers in parallel"
clangVerbose <- switch $
short 'v' <>
long "verbose" <>
help "Enable verbose logging from subprocesses"
clangProgress <- switch $
long "progress" <>
help "Display indexing progress even in verbose mode"
return Clang{..}

-- | Standard indexer, that also runs the deriver
Expand All @@ -77,99 +92,192 @@ indexerWith deriveToo = Indexer {
indexerShortName = "cpp-cmake",
indexerDescription = "Index C++ code with CMake (via Clang)",
indexerOptParser = options,
indexerRun = \Clang{..} backend repo IndexerParams{..} -> do
indexerRun = \clang@Clang{..} backend repo IndexerParams{..} -> do
-- indexing
let tmpDir = indexerOutput
inventoryFile = tmpDir </> "inventory.data"
indexerData = tmpDir </> "indexer.data"
generateInventory backend repo inventoryFile
compileDBDir <-
case clangCompileDBDir of
Nothing -> cmake clangVerbose indexerRoot tmpDir >> return tmpDir
Just dir -> return dir
index clangVerbose clangIndexBin inventoryFile
indexerRoot compileDBDir indexerData
indexerData <-
index clang inventoryFile indexerRoot compileDBDir indexerOutput
writeToDB backend repo indexerData

-- deriving
when deriveToo $
derive clangVerbose clangDeriveBin backend repo
}

where generateInventory backend repo outFile =
serializeInventory backend repo >>= BS.writeFile outFile
where
generateInventory backend repo outFile =
serializeInventory backend repo >>= BS.writeFile outFile

cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin ->
spawnAndConcurrentLog verbose cmakeBin
[ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1"
, "-S", srcDir
, "-B", tmpDir
]

cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin ->
spawnAndConcurrentLog verbose cmakeBin
[ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1"
, "-S", srcDir
, "-B", tmpDir
index Clang{..} inventory srcDir buildDir tmpDir =
withExe "clang-index" clangIndexBin $ \clangIndex -> do
let args =
[ "-cdb_dir", buildDir
, "-cdb_target", "all"
, "-root", srcDir
, "--inventory", inventory
, "-logtostderr"
]

index verbose indexBin inventory srcDir buildDir outFile =
withExe "clang-index" indexBin $ \clangIndex -> do
let args = [ "-cdb_dir", buildDir
, "-cdb_target", "all"
, "-root", srcDir
, "-dump", outFile
, "--inventory", inventory
, "-logtostderr"
]
spawnAndConcurrentLog verbose clangIndex args

writeToDB backend repo dataFile = do
dat <- BS.readFile dataFile
case deserializeGen (Proxy :: Proxy Compact) dat of
Left parseError -> error parseError
Right batch -> sendBatch backend repo batch

derive verbose deriveBin backend repo =
withExe "clang-derive" deriveBin $ \clangDerive -> do
let go service = spawnAndConcurrentLog verbose clangDerive
[ "--repo", showRepo repo
, "--service", service
]
case backendKind backend of
BackendEnv env -> do
fb303 <- newFb303 "gleandriver"
let state = GleanHandler.State fb303 env
withBackgroundFacebookService
(GleanHandler.fb303State state)
(GleanHandler.handler state)
CppServer.defaultOptions
$ \server ->
go ("localhost:" <> show (CppServer.serverPort server))
BackendThrift thrift -> do
let clientConfig = thriftBackendClientConfig thrift
go $ serviceToString (clientConfig_serv clientConfig)
-- get the total number of source files
sources <- do
let pargs = args ++ ["--print_sources_count"]
s <- readProcess clangIndex pargs ""
case reads s of
[(sources,"")] -> return sources
_ -> error $ unwords (clangIndex:pargs)
++ " produced unexpect output \"" ++ s ++ "\""

case sources of
0 -> do
-- TODO: should this be an error?
putStrLn "No source files to index"
return []
_ ->
-- set up worklist
let ranges =
map (\(i,n) -> Worklist.Range i (i+n)) $ chunk clangJobs sources
!workers = length ranges
in
Worklist.withTemp ranges $ \wfile worklist ->

-- progress and logging
(if clangProgress || not clangVerbose
then withProgress worklist clangJobs sources
else id) $
withLog clangVerbose (void . evaluate . length) $ \stream -> do

-- run workers
let dataFile i = tmpDir </> "indexer-" <> show i <> ".data"
workerargs i = args ++
[ "-dump", dataFile i
, "--work_file", wfile
, "--worker_index", show i
, "--worker_count", show workers
]
forConcurrently_ [0 .. workers-1] $ \i -> bracket
-- createProcess_ because we don't want the stdout/stderr handles
-- to be closed
(createProcess_
"Cpp.index"
(proc clangIndex $ workerargs i)
{std_out = stream, std_err = stream})
cleanupProcess
$ \(_, _, _, ph) -> do
ex <- waitForProcess ph
case ex of
ExitSuccess -> return ()
ExitFailure i -> error $ unwords (clangIndex:workerargs i)
++ " returned exit code " ++ show i

-- return data file names
return $ map dataFile [0 .. workers-1]

writeToDB backend repo = mapM_ $ \dataFile -> do
dat <- BS.readFile dataFile
case deserializeGen (Proxy :: Proxy Compact) dat of
Left parseError -> error parseError
Right batch -> sendBatch backend repo batch

derive verbose deriveBin backend repo =
withExe "clang-derive" deriveBin $ \clangDerive -> do
let go service = spawnAndConcurrentLog verbose clangDerive
[ "--repo", showRepo repo
, "--service", service
]
case backendKind backend of
BackendEnv env -> do
fb303 <- newFb303 "gleandriver"
let state = GleanHandler.State fb303 env
withBackgroundFacebookService
(GleanHandler.fb303State state)
(GleanHandler.handler state)
CppServer.defaultOptions
$ \server ->
go ("localhost:" <> show (CppServer.serverPort server))
BackendThrift thrift -> do
let clientConfig = thriftBackendClientConfig thrift
go $ serviceToString (clientConfig_serv clientConfig)

-- | Run an action while periodically reporting indexing progress on stdout.
--
-- Arguments, in order:
--
--   * the worklist whose entries are polled to find out how much work is left
--   * the number of worklist entries to poll (indices @0 .. jobs-1@)
--   * the total number of source files being indexed
--   * the action to run while progress is being displayed
--
-- The reporter runs in a background thread via 'withAsync' and polls the
-- worklist once per second. If stdout supports ANSI control codes, a single
-- status line is rewritten in place; otherwise a compact
-- @Indexing N source files: 0% ... 10% ...@ trail is appended to one line.
--
-- The final \"100%\" message is only printed when the action returns
-- normally; there is no handler that prints it if the action throws.
withProgress :: Worklist.Worklist -> Int -> Int -> IO a -> IO a
withProgress worklist jobs total action = do
  terminal <- ANSI.hSupportsANSI stdout
  flush $ start terminal
  -- The reporter loops forever; it only stops because the action finished.
  x <- withAsync (showProgress terminal 0) $ const action
  flush $ finish terminal
  return x
  where
    -- Progress output is often written without a trailing newline, so flush
    -- explicitly after every update.
    flush f = f >> hFlush stdout

    -- Status line for ANSI terminals; @n@ is the number of files remaining.
    tmsg n =
      unwords ["Indexed", show (total - n), "of", show total, "source files"]

    start True = putStrLn $ tmsg total
    start False = putStr $ unwords ["Indexing", show total, "source files: 0%"]

    finish True = do
      ANSI.cursorUpLine 1
      putStrLn $ tmsg 0
    finish False =
      -- Fixed: the separating space before "source files" was missing, which
      -- produced output such as "Indexed 42source files".
      putStrLn $ " ... 100%\nIndexed " ++ show total ++ " source files"

    -- Emit an update given the previous and current remaining counts.
    message True before now = when (before /= now) $ do
      -- Move back onto the previous status line and overwrite it.
      ANSI.cursorUpLine 1
      putStrLn $ tmsg now
    message False before now = do
      let tenth n = ((total - n) * 10) `div` total
          t = tenth now
      -- 0% and 100% are printed by 'start' and 'finish', so only the
      -- intermediate multiples of 10% are printed here, each at most once.
      when (t /= 0 && t /= 10 && t /= tenth before) $
        putStr $ " ... " <> show t <> "0%"

    showProgress terminal before = do
      ranges <- mapM (Worklist.get worklist) [0 .. jobs-1]
      -- Remaining work is the summed length of all ranges, clamped at zero.
      let remaining = max 0 $ sum $ map (\(Worklist.Range i k) -> k-i) ranges
      flush $ message terminal before remaining
      threadDelay 1000000 -- one second, in microseconds
      showProgress terminal remaining

-- | Provide a 'StdStream' for a child process's output and run an action
-- with it.
--
-- When the first argument is 'True' (verbose), the action receives 'Inherit',
-- so the child shares the parent's handles and the consumer is never called.
--
-- Otherwise a pipe is created. The action receives its write end wrapped in
-- 'UseHandle', while the consumer (second argument) runs in a background
-- thread on the lazily read contents of the read end. The background thread
-- is scoped by 'withAsync', so it does not outlive the action, and both pipe
-- ends are closed by 'bracket' once the action completes or throws.
withLog :: Bool -> (String -> IO ()) -> (StdStream -> IO a) -> IO a
withLog verbose log act
  | verbose = act Inherit
  | otherwise = bracket createPipe closeBoth runPiped
  where
    -- Release both ends of the pipe, read end first.
    closeBoth (readEnd, writeEnd) = hClose readEnd >> hClose writeEnd

    -- Feed the pipe's output to the consumer while the action is running.
    runPiped (readEnd, writeEnd) =
      withAsync (hGetContents readEnd >>= log) $ \_ ->
        act (UseHandle writeEnd)

-- | Simple concurrent logger. Spawn the process and asynchronously log
-- concise or full contents to stdout. Should use a fancy progress bar really
spawnAndConcurrentLog :: Bool -> FilePath -> [String] -> IO ()
spawnAndConcurrentLog verbose exe args = do
(_, Just hout, Just herr, ph) <- createProcess (proc exe args)
{ std_out = CreatePipe, std_err = CreatePipe }
ex <- withAsync (log hout) $ \asyncOut ->
withAsync (log herr) $ \asyncErr -> do
status <- waitForProcess ph
cancel asyncOut
cancel asyncErr
putStr "\n" >> hFlush stdout
return status
spawnAndConcurrentLog verbose exe args = withLog verbose log $ \stream -> do
(_, _, _, ph) <- createProcess (proc exe args)
{ std_out = stream, std_err = stream }
ex <- waitForProcess ph
case ex of
ExitSuccess -> return ()
ExitFailure i -> error $
unwords (exe:args) ++ " returned exit code " ++ show i
where
log h = mapM_ draw . lines =<< hGetContents h
draw s
| verbose = putStrLn s
| otherwise = putChar '.' >> hFlush stdout
log s = mapM_ (const $ putChar '.' >> hFlush stdout) (lines s)
`finally` do
putStr "\n"
hFlush stdout

--
-- We need to find clang-index and clang-derive in $PATH or in-tree
--
withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO ()) -> IO ()
withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO a) -> IO a
withExe _ (Just exePath) f = do
exeExists <- doesFileExist exePath
if exeExists
Expand Down
1 change: 1 addition & 0 deletions glean/lang/clang/glean-clang.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ flag opt

common deps
build-depends:
ansi-terminal ^>= 0.11,
array ^>=0.5.2.0,
async ^>=2.2.1,
base >=4.11.1 && <4.15,
Expand Down
13 changes: 12 additions & 1 deletion glean/lang/clang/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ DEFINE_string(clang_resource_dir, "", "PATH to Clang resource dir");
DEFINE_string(cdb_target, "", "Target name");
DEFINE_string(cdb_dir, "", "Directory with compile_commands.json in it");

// This is a hack to support parallel indexing in the Glean CLI: the CLI
// first runs the indexer with this flag to learn how many source files there
// are, then uses that count to split the work between parallel workers.
DEFINE_bool(
    print_sources_count,
    false,
    "Print the number of source files and exit");

static llvm::cl::OptionCategory indexerCategory("glean");

// This file implements some plumbing and the main function for the
Expand Down Expand Up @@ -229,7 +235,7 @@ struct Config {
// No logging when dumping to a file
should_log = false;
sender = fileWriter(FLAGS_dump);
} else {
} else if (!FLAGS_print_sources_count) {
fail("missing --service or --dump");
}

Expand Down Expand Up @@ -590,6 +596,11 @@ int main(int argc, char **argv) {

Config config(argc, argv);

if (FLAGS_print_sources_count) {
std::cout << config.sources.size();
return 0;
}

const auto work_counter = FLAGS_work_file.empty()
? worklist::serialCounter(0, config.sources.size())
: worklist::stealingCounter(
Expand Down
Loading

0 comments on commit 0930b0f

Please sign in to comment.