From 0930b0f2964d7942d73b87683331a568465783a5 Mon Sep 17 00:00:00 2001 From: Roman Leshchinskiy Date: Sun, 25 Sep 2022 05:23:28 -0700 Subject: [PATCH] Allow concurrent indexing in glean index cpp-cmake (#271) Summary: This adds support for concurrent indexing based on the existing worklist framework. There are some shenanigans related to progress reporting which mostly exist to make indexing LLVM nicer (script coming up). The other major addition is the index-llvm.sh script which automates indexing LLVM source distributions using the parallel indexer - try the example suggested in index-llvm.sh --help. I'm not sure where the script should live. On my server indexing takes about 10 min with -j16. Note that this includes https://github.com/facebookincubator/Glean/issues/269 - we don't have stacked PRs so that one should probably be landed first. Pull Request resolved: https://github.com/facebookincubator/Glean/pull/271 Reviewed By: simonmar Differential Revision: D39420798 Pulled By: pepeiborra fbshipit-source-id: 2dc81e20a20fb522e42557944cd4871f72099775 --- glean.cabal.in | 11 ++ glean/lang/clang/Glean/Indexer/Cpp.hs | 232 +++++++++++++++++++------- glean/lang/clang/glean-clang.cabal | 1 + glean/lang/clang/index.cpp | 13 +- index-llvm.sh | 216 ++++++++++++++++++++++++ mk/cxx.mk | 1 + quick.sh | 10 +- 7 files changed, 419 insertions(+), 65 deletions(-) create mode 100755 index-llvm.sh diff --git a/glean.cabal.in b/glean.cabal.in index 894a62ffc..0600c7698 100644 --- a/glean.cabal.in +++ b/glean.cabal.in @@ -589,6 +589,16 @@ library client-cpp build-depends: glean:rts +library interprocess + import: fb-haskell, deps + visibility: public + hs-source-dirs: glean/interprocess/hs + exposed-modules: + Glean.Interprocess.Counters + Glean.Interprocess.Worklist + build-depends: + glean:client-cpp + library schema import: fb-haskell, fb-cpp, deps visibility: public @@ -995,6 +1005,7 @@ library indexers glean:client-hs-local, glean:db, glean:handler, + glean:interprocess, 
glean:lib, glean:lsif, glean:stubs, diff --git a/glean/lang/clang/Glean/Indexer/Cpp.hs b/glean/lang/clang/Glean/Indexer/Cpp.hs index bce13a9ba..474f216f0 100644 --- a/glean/lang/clang/Glean/Indexer/Cpp.hs +++ b/glean/lang/clang/Glean/Indexer/Cpp.hs @@ -11,10 +11,13 @@ module Glean.Indexer.Cpp ( indexerWith, indexer, indexerNoDeriv, Clang(..) , findExecutableRecursive ) where +import Control.Concurrent (threadDelay) import Control.Concurrent.Async +import Control.Exception import Control.Monad import Data.Proxy import Options.Applicative +import qualified System.Console.ANSI as ANSI import System.Directory import System.Environment import System.Exit @@ -23,6 +26,7 @@ import System.IO import System.Process import Thrift.Protocol (deserializeGen) import Thrift.Protocol.Compact (Compact) +import Util.List (chunk) import Facebook.Fb303 import Facebook.Service @@ -32,6 +36,7 @@ import Glean.Indexer import Glean.LocalOrRemote ( BackendKind(..), LocalOrRemote(..), serializeInventory ) import Glean.Util.Service +import qualified Glean.Interprocess.Worklist as Worklist import qualified Data.ByteString as BS import qualified Glean.Handler as GleanHandler @@ -42,7 +47,9 @@ data Clang = Clang , clangDeriveBin :: Maybe FilePath -- ^ path to @clang-derive@ binary , clangCompileDBDir :: Maybe FilePath -- ^ (optional) path to pre-existing @compile_commands.json@ + , clangJobs :: Int -- ^ number of indexers to run concurrently , clangVerbose :: Bool -- ^ display debugging information + , clangProgress :: Bool -- ^ display indexing progress } deriving Show options :: Parser Clang @@ -56,10 +63,18 @@ options = do clangCompileDBDir <- optional $ strOption $ long "cdb" <> help "path to a directory containing an existing compile_commands.json file" + clangJobs <- option auto $ + short 'j' <> + long "jobs" <> + value 1 <> + help "run N indexers in parallel" clangVerbose <- switch $ short 'v' <> long "verbose" <> help "Enable verbose logging from subprocesses" + clangProgress <- switch $ 
+ long "progress" <> + help "Display indexing progress even in verbose mode" return Clang{..} -- | Standard indexer, that also runs the deriver @@ -77,18 +92,17 @@ indexerWith deriveToo = Indexer { indexerShortName = "cpp-cmake", indexerDescription = "Index C++ code with CMake (via Clang)", indexerOptParser = options, - indexerRun = \Clang{..} backend repo IndexerParams{..} -> do + indexerRun = \clang@Clang{..} backend repo IndexerParams{..} -> do -- indexing let tmpDir = indexerOutput inventoryFile = tmpDir </> "inventory.data" - indexerData = tmpDir </> "indexer.data" generateInventory backend repo inventoryFile compileDBDir <- case clangCompileDBDir of Nothing -> cmake clangVerbose indexerRoot tmpDir >> return tmpDir Just dir -> return dir - index clangVerbose clangIndexBin inventoryFile - indexerRoot compileDBDir indexerData + indexerData <- + index clang inventoryFile indexerRoot compileDBDir indexerOutput writeToDB backend repo indexerData -- deriving @@ -96,80 +110,174 @@ indexerWith deriveToo = Indexer { derive clangVerbose clangDeriveBin backend repo } - where generateInventory backend repo outFile = - serializeInventory backend repo >>= BS.writeFile outFile + where + generateInventory backend repo outFile = + serializeInventory backend repo >>= BS.writeFile outFile + + cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin -> + spawnAndConcurrentLog verbose cmakeBin + [ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1" + , "-S", srcDir + , "-B", tmpDir + ] - cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin -> - spawnAndConcurrentLog verbose cmakeBin - [ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1" - , "-S", srcDir - , "-B", tmpDir + index Clang{..} inventory srcDir buildDir tmpDir = + withExe "clang-index" clangIndexBin $ \clangIndex -> do + let args = + [ "-cdb_dir", buildDir + , "-cdb_target", "all" + , "-root", srcDir + , "--inventory", inventory + , "-logtostderr" ] - index verbose indexBin inventory srcDir buildDir outFile = - withExe "clang-index" 
indexBin $ \clangIndex -> do - let args = [ "-cdb_dir", buildDir - , "-cdb_target", "all" - , "-root", srcDir - , "-dump", outFile - , "--inventory", inventory - , "-logtostderr" - ] - spawnAndConcurrentLog verbose clangIndex args - - writeToDB backend repo dataFile = do - dat <- BS.readFile dataFile - case deserializeGen (Proxy :: Proxy Compact) dat of - Left parseError -> error parseError - Right batch -> sendBatch backend repo batch - - derive verbose deriveBin backend repo = - withExe "clang-derive" deriveBin $ \clangDerive -> do - let go service = spawnAndConcurrentLog verbose clangDerive - [ "--repo", showRepo repo - , "--service", service - ] - case backendKind backend of - BackendEnv env -> do - fb303 <- newFb303 "gleandriver" - let state = GleanHandler.State fb303 env - withBackgroundFacebookService - (GleanHandler.fb303State state) - (GleanHandler.handler state) - CppServer.defaultOptions - $ \server -> - go ("localhost:" <> show (CppServer.serverPort server)) - BackendThrift thrift -> do - let clientConfig = thriftBackendClientConfig thrift - go $ serviceToString (clientConfig_serv clientConfig) + -- get the total number of source files + sources <- do + let pargs = args ++ ["--print_sources_count"] + s <- readProcess clangIndex pargs "" + case reads s of + [(sources,"")] -> return sources + _ -> error $ unwords (clangIndex:pargs) + ++ " produced unexpected output \"" ++ s ++ "\"" + + case sources of + 0 -> do + -- TODO: should this be an error? + putStrLn "No source files to index" + return [] + _ -> + -- set up worklist + let ranges = + map (\(i,n) -> Worklist.Range i (i+n)) $ chunk clangJobs sources + !workers = length ranges + in + Worklist.withTemp ranges $ \wfile worklist -> + + -- progress and logging + (if clangProgress || not clangVerbose + then withProgress worklist workers sources + else id) $ + withLog clangVerbose (void . evaluate . 
length) $ \stream -> do + + -- run workers + let dataFile i = tmpDir </> "indexer-" <> show i <> ".data" + workerargs i = args ++ + [ "-dump", dataFile i + , "--work_file", wfile + , "--worker_index", show i + , "--worker_count", show workers + ] + forConcurrently_ [0 .. workers-1] $ \i -> bracket + -- createProcess_ because we don't want the stdout/stderr handles + -- to be closed + (createProcess_ + "Cpp.index" + (proc clangIndex $ workerargs i) + {std_out = stream, std_err = stream}) + cleanupProcess + $ \(_, _, _, ph) -> do + ex <- waitForProcess ph + case ex of + ExitSuccess -> return () + ExitFailure code -> error $ unwords (clangIndex:workerargs i) + ++ " returned exit code " ++ show code + + -- return data file names + return $ map dataFile [0 .. workers-1] + + writeToDB backend repo = mapM_ $ \dataFile -> do + dat <- BS.readFile dataFile + case deserializeGen (Proxy :: Proxy Compact) dat of + Left parseError -> error parseError + Right batch -> sendBatch backend repo batch + + derive verbose deriveBin backend repo = + withExe "clang-derive" deriveBin $ \clangDerive -> do + let go service = spawnAndConcurrentLog verbose clangDerive + [ "--repo", showRepo repo + , "--service", service + ] + case backendKind backend of + BackendEnv env -> do + fb303 <- newFb303 "gleandriver" + let state = GleanHandler.State fb303 env + withBackgroundFacebookService + (GleanHandler.fb303State state) + (GleanHandler.handler state) + CppServer.defaultOptions + $ \server -> + go ("localhost:" <> show (CppServer.serverPort server)) + BackendThrift thrift -> do + let clientConfig = thriftBackendClientConfig thrift + go $ serviceToString (clientConfig_serv clientConfig) + +withProgress :: Worklist.Worklist -> Int -> Int -> IO a -> IO a +withProgress worklist jobs total action = do + terminal <- ANSI.hSupportsANSI stdout + flush $ start terminal + x <- withAsync (showProgress terminal 0) $ const action + flush $ finish terminal + return x + where + flush f = f >> hFlush stdout + + tmsg n = 
unwords ["Indexed", show (total - n), "of", show total, "source files"] + + start True = putStrLn $ tmsg total + start False = putStr $ unwords ["Indexing", show total, "source files: 0%"] + + finish True = do + ANSI.cursorUpLine 1 + putStrLn $ tmsg 0 + finish False = + putStrLn $ " ... 100%\nIndexed " ++ show total ++ " source files" + + message True before now = when (before /= now) $ do + ANSI.cursorUpLine 1 + putStrLn $ tmsg now + message False before now = do + let tenth n = ((total - n) * 10) `div` total + t = tenth now + when (t /= 0 && t /= 10 && t /= tenth before) $ + putStr $ " ... " <> show (tenth now) <> "0%" + + showProgress terminal before = do + ranges <- mapM (Worklist.get worklist) [0 .. jobs-1] + let remaining = max 0 $ sum $ map (\(Worklist.Range i k) -> k-i) ranges + flush $ message terminal before remaining + threadDelay 1000000 + showProgress terminal remaining + +withLog :: Bool -> (String -> IO ()) -> (StdStream -> IO a) -> IO a +withLog verbose log act + | verbose = act Inherit + | otherwise = bracket createPipe (\(r,w) -> hClose r >> hClose w) + $ \(outRead, outWrite) -> + withAsync (log =<< hGetContents outRead) + $ const $ act $ UseHandle outWrite -- | Simple concurrent logger. Spawn the process and asynchronously log -- concise or full contents to stdout. 
Should use a fancy progress bar really spawnAndConcurrentLog :: Bool -> FilePath -> [String] -> IO () -spawnAndConcurrentLog verbose exe args = do - (_, Just hout, Just herr, ph) <- createProcess (proc exe args) - { std_out = CreatePipe, std_err = CreatePipe } - ex <- withAsync (log hout) $ \asyncOut -> - withAsync (log herr) $ \asyncErr -> do - status <- waitForProcess ph - cancel asyncOut - cancel asyncErr - putStr "\n" >> hFlush stdout - return status +spawnAndConcurrentLog verbose exe args = withLog verbose log $ \stream -> do + (_, _, _, ph) <- createProcess (proc exe args) + { std_out = stream, std_err = stream } + ex <- waitForProcess ph case ex of ExitSuccess -> return () ExitFailure i -> error $ unwords (exe:args) ++ " returned exit code " ++ show i where - log h = mapM_ draw . lines =<< hGetContents h - draw s - | verbose = putStrLn s - | otherwise = putChar '.' >> hFlush stdout + log s = mapM_ (const $ putChar '.' >> hFlush stdout) (lines s) + `finally` do + putStr "\n" + hFlush stdout -- -- We need to find clang-index and clang-derive in $PATH or in-tree -- -withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO ()) -> IO () +withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO a) -> IO a withExe _ (Just exePath) f = do exeExists <- doesFileExist exePath if exeExists diff --git a/glean/lang/clang/glean-clang.cabal b/glean/lang/clang/glean-clang.cabal index 31a213d98..8b823189d 100644 --- a/glean/lang/clang/glean-clang.cabal +++ b/glean/lang/clang/glean-clang.cabal @@ -38,6 +38,7 @@ flag opt common deps build-depends: + ansi-terminal ^>= 0.11, array ^>=0.5.2.0, async ^>=2.2.1, base >=4.11.1 && <4.15, diff --git a/glean/lang/clang/index.cpp b/glean/lang/clang/index.cpp index b7df388d9..81e530e29 100644 --- a/glean/lang/clang/index.cpp +++ b/glean/lang/clang/index.cpp @@ -91,6 +91,12 @@ DEFINE_string(clang_resource_dir, "", "PATH to Clang resource dir"); DEFINE_string(cdb_target, "", "Target name"); DEFINE_string(cdb_dir, "", "Directory with 
compile_commands.json in it"); +// This is a hack to support parallel indexing in the Glean CLI +DEFINE_bool( + print_sources_count, + false, + "Print the number of source files and exit"); + static llvm::cl::OptionCategory indexerCategory("glean"); // This file implements some plumbing and the main function for the @@ -229,7 +235,7 @@ struct Config { // No logging when dumping to a file should_log = false; sender = fileWriter(FLAGS_dump); - } else { + } else if (!FLAGS_print_sources_count) { fail("missing --service or --dump"); } @@ -590,6 +596,11 @@ int main(int argc, char **argv) { Config config(argc, argv); + if (FLAGS_print_sources_count) { + std::cout << config.sources.size(); + return 0; + } + const auto work_counter = FLAGS_work_file.empty() ? worklist::serialCounter(0, config.sources.size()) : worklist::stealingCounter( diff --git a/index-llvm.sh b/index-llvm.sh new file mode 100755 index 000000000..bda5b4058 --- /dev/null +++ b/index-llvm.sh @@ -0,0 +1,216 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -e +shopt -s extglob +shopt -s inherit_errexit + +fatal() { + echo "$@" 1>&2 + exit 1 +} + +usage() { + cat << EOF +Usage: $(basename "$0") FLAGS SOURCE_DIR TARGET_DIR + + Index LLVM source distribution located in SOURCE_DIR into TARGET_DIR + +Example: + mkdir -p /tmp/llvm \\ + && ( wget -O - https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.6/llvm-14.0.6.src.tar.xz \\ + | tar -C /tmp/llvm -xJ ) \\ + && $0 -j$(grep -c '^processor' /proc/cpuinfo) /tmp/llvm/llvm-14.0.6.src /tmp/llvm + + +Available options: + --glean DIR Specify Glean source directory + -jN | --jobs N Use N concurrent indexers + --db-root DIR Store the generated database in DIR + (default TARGET_DIR/db) + --db DB Use provided name for the database + (default llvm/VERSION) + --overwrite Overwrite the database if it already exists + --schema DIR Use the schema in DIR (default is the schema in + Glean's source tree) +EOF +} + +argerror() { + echo "$@" >&2 + usage >&2 + exit 1 +} + +if [[ "$1" == "--help" ]] +then + usage + exit 0 +fi + +MAKE_ARGS=() +EXTRA_GLEAN_ARGS=() +EXTRA_GLEAN_INDEX_ARGS=() +VERBOSITY=0 +GLEAN_DB_ROOT= +GLEAN_SCHEMA= +GLEAN_DIR= + +while true +do + case "$1" in + (-j+([0-9])) + MAKE_ARGS+=("$1") + EXTRA_GLEAN_INDEX_ARGS+=("$1") + shift + ;; + -j|--jobs) + MAKE_ARGS+=("$1") + EXTRA_GLEAN_INDEX_ARGS+=("$1") + case "$2" in + (+([0-9])) + MAKE_ARGS+=("$2") + EXTRA_GLEAN_INDEX_ARGS+=("$2") + shift 2 + ;; + *) + argerror "Invalid number of jobs" + ;; + esac + ;; + -v|--verbose) + VERBOSITY=1 + EXTRA_GLEAN_INDEX_ARGS+=("-v") + shift + ;; + --db-root) + GLEAN_DB_ROOT="$2" + shift 2 + ;; + --schema) + GLEAN_SCHEMA="$2" + shift 2 + ;; + --db) + GLEAN_DB="$2" + shift 2 + ;; + --overwrite) + OVERWRITE=yes + shift + ;; + --glean) + GLEAN_DIR="$2" + shift 2 + ;; + -*) + argerror "Unsupported option $1" + ;; + *) + break + ;; + esac +done + +if [[ -z "$1" ]] +then + argerror "No source directory specified" +fi + +SOURCE_DIR="$1" +shift + +if [[ -z "$1" ]] +then + argerror 
"No destination directory specified" +fi + +OUTPUT_DIR="$1" +shift + +if [[ $# -ne 0 ]] +then + argerror "Extra arguments" +fi + +if [[ -z "${GLEAN_DIR}" ]] +then + # Look for any ancestor directory with glean.cabal.in - this allows us to + # move the script within the source tree + dir=$(cd "$(dirname "$0")" && pwd) + while [[ "${dir}" != "/" ]] ; do + if [[ -f "${dir}"/glean.cabal.in ]] + then + GLEAN_DIR="${dir}" + break + fi + dir=$(dirname "${dir}") + done + if [[ -z "${GLEAN_DIR}" ]] ; then + fatal "Couldn't locate Glean source directory, please specify --glean" + fi +fi + +BUILD_DIR="${OUTPUT_DIR}"/build +mkdir -p "${BUILD_DIR}" +GLEAN_DB_ROOT="${GLEAN_DB_ROOT:-${OUTPUT_DIR}/db}" +mkdir -p "${GLEAN_DB_ROOT}" + +GLEAN_SCHEMA="${GLEAN_SCHEMA:-${GLEAN_DIR}/glean/schema/source}" + +# FD 5 is where we redirect all output +if [[ "${VERBOSITY}" == "1" ]] ; then + exec 5>&1 +else + exec 5> "${OUTPUT_DIR}/index-llvm.log" +fi + +echo "Building glean-clang" +make MODE=opt glean-clang >&5 + +requirebin() { + tmp="$("${GLEAN_DIR}/quick.sh" MODE=opt list-bin "$1")" + if ! [[ -x "${tmp}" ]] ; then + fatal "$1 doesn't exist at ${tmp}" + fi + echo "${tmp}" +} + +CLANG_INDEX=$(requirebin glean-clang:clang-index) +CLANG_DERIVE=$(requirebin glean-clang:clang-derive) +GLEAN=$(requirebin glean:glean) + +echo "Setting up ${BUILD_DIR}" +mkdir -p "${BUILD_DIR}" >&5 +"${CMAKE:-cmake}" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_TARGETS_TO_BUILD=X86 \ + -S "${SOURCE_DIR}" -B "${BUILD_DIR}" >&5 + +echo "Generating LLVM code" +# We get all the *TableGen targets from make help which thankfully lists all +# available targets. We do want word splitting here so disable the corresponding +# check. +# shellcheck disable=SC2046 +(cd "${BUILD_DIR}" && \ + make "${MAKE_ARGS[@]}" intrinsics_gen acc_gen omp_gen llvm_vcsrevision_h \ + $(make help | sed -n '/^[.][.][.] 
[A-Za-z0-9]*TableGen$/ s/^....//p')) >&5 + +if [[ -z "${GLEAN_DB}" ]] ; then + GLEAN_DB=llvm/$(sed -n '/^Version/ {s/^Version: //p;q}' "${BUILD_DIR}/llvm.spec") +fi + +if [[ -n "${OVERWRITE}" ]] ; then + rm -rf "${GLEAN_DB_ROOT:?}/${GLEAN_DB}" +fi + +echo "Indexing ${GLEAN_DB} in ${GLEAN_DB_ROOT}" +"${GLEAN}" "${EXTRA_GLEAN_ARGS[@]}" \ + --schema "${GLEAN_SCHEMA}" --db-root "${GLEAN_DB_ROOT}" \ + index --db "${GLEAN_DB}" cpp-cmake \ + --indexer "${CLANG_INDEX}" --deriver "${CLANG_DERIVE}" --cdb "${BUILD_DIR}" \ + --verbose --progress "${EXTRA_GLEAN_INDEX_ARGS[@]}" "${SOURCE_DIR}" 2>&5 diff --git a/mk/cxx.mk b/mk/cxx.mk index 5282739af..f49132f81 100644 --- a/mk/cxx.mk +++ b/mk/cxx.mk @@ -85,6 +85,7 @@ CXX_LIBRARIES = $(subst CXX_SOURCES_,,$(filter CXX_SOURCES_%, $(.VARIABLES))) .PHONY: cxx-libraries cxx-libraries: + @: # Include the right settings # diff --git a/quick.sh b/quick.sh index 325d086c9..699b55a5f 100755 --- a/quick.sh +++ b/quick.sh @@ -16,7 +16,7 @@ MAKE_ARGS=() for arg in "$@"; do case $arg in - build|run|test) + build|run|test|list-bin) ACTION="$1" shift break @@ -42,4 +42,10 @@ make "${MAKE_ARGS[@]}" .build/current.sh glean.cabal cxx-libraries . .build/current.sh -call_cabal "${ACTION}" "${TARGET}" -- "$@" +CABAL_ARGS=() +# Suppress "Up to date" etc. for list-bin +if [ "$ACTION" = "list-bin" ]; then + CABAL_ARGS+=(-vsilent) +fi + +call_cabal "${CABAL_ARGS[@]}" "${ACTION}" "${TARGET}" -- "$@"