diff --git a/glean.cabal.in b/glean.cabal.in
index 894a62ffc..0600c7698 100644
--- a/glean.cabal.in
+++ b/glean.cabal.in
@@ -589,6 +589,16 @@ library client-cpp
     build-depends:
         glean:rts
 
+library interprocess
+    import: fb-haskell, deps
+    visibility: public
+    hs-source-dirs: glean/interprocess/hs
+    exposed-modules:
+        Glean.Interprocess.Counters
+        Glean.Interprocess.Worklist
+    build-depends:
+        glean:client-cpp
+
 library schema
     import: fb-haskell, fb-cpp, deps
     visibility: public
@@ -995,6 +1005,7 @@ library indexers
         glean:client-hs-local,
         glean:db,
         glean:handler,
+        glean:interprocess,
         glean:lib,
         glean:lsif,
         glean:stubs,
diff --git a/glean/lang/clang/Glean/Indexer/Cpp.hs b/glean/lang/clang/Glean/Indexer/Cpp.hs
index bce13a9ba..474f216f0 100644
--- a/glean/lang/clang/Glean/Indexer/Cpp.hs
+++ b/glean/lang/clang/Glean/Indexer/Cpp.hs
@@ -11,10 +11,13 @@ module Glean.Indexer.Cpp
   ( indexerWith, indexer, indexerNoDeriv, Clang(..)
   , findExecutableRecursive ) where
 
+import Control.Concurrent (threadDelay)
 import Control.Concurrent.Async
+import Control.Exception
 import Control.Monad
 import Data.Proxy
 import Options.Applicative
+import qualified System.Console.ANSI as ANSI
 import System.Directory
 import System.Environment
 import System.Exit
@@ -23,6 +26,7 @@ import System.IO
 import System.Process
 import Thrift.Protocol (deserializeGen)
 import Thrift.Protocol.Compact (Compact)
+import Util.List (chunk)
 
 import Facebook.Fb303
 import Facebook.Service
@@ -32,6 +36,7 @@ import Glean.Indexer
 import Glean.LocalOrRemote ( BackendKind(..), LocalOrRemote(..),
   serializeInventory )
 import Glean.Util.Service
+import qualified Glean.Interprocess.Worklist as Worklist
 
 import qualified Data.ByteString as BS
 import qualified Glean.Handler as GleanHandler
@@ -42,7 +47,9 @@ data Clang = Clang
   , clangDeriveBin :: Maybe FilePath -- ^ path to @clang-derive@ binary
   , clangCompileDBDir :: Maybe FilePath
       -- ^ (optional) path to pre-existing @compile_commands.json@
+  , clangJobs :: Int -- ^ number of indexers to run concurrently
   , clangVerbose :: Bool -- ^ display debugging information
+  , clangProgress :: Bool -- ^ display indexing progress
   } deriving Show
 
 options :: Parser Clang
@@ -56,10 +63,18 @@ options = do
   clangCompileDBDir <- optional $ strOption $
     long "cdb" <>
     help "path to a directory containing an existing compile_commands.json file"
+  clangJobs <- option auto $
+    short 'j' <>
+    long "jobs" <>
+    value 1 <>
+    help "run N indexers in parallel"
   clangVerbose <- switch $
     short 'v' <>
     long "verbose" <>
     help "Enable verbose logging from subprocesses"
+  clangProgress <- switch $
+    long "progress" <>
+    help "Display indexing progress even in verbose mode"
   return Clang{..}
 
 -- | Standard indexer, that also runs the deriver
@@ -77,18 +92,17 @@ indexerWith deriveToo = Indexer {
   indexerShortName = "cpp-cmake",
   indexerDescription = "Index C++ code with CMake (via Clang)",
   indexerOptParser = options,
-  indexerRun = \Clang{..} backend repo IndexerParams{..} -> do
+  indexerRun = \clang@Clang{..} backend repo IndexerParams{..} -> do
     -- indexing
     let tmpDir = indexerOutput
         inventoryFile = tmpDir </> "inventory.data"
-        indexerData = tmpDir </> "indexer.data"
     generateInventory backend repo inventoryFile
     compileDBDir <-
       case clangCompileDBDir of
         Nothing -> cmake clangVerbose indexerRoot tmpDir >> return tmpDir
         Just dir -> return dir
-    index clangVerbose clangIndexBin inventoryFile
-      indexerRoot compileDBDir indexerData
+    indexerData <-
+      index clang inventoryFile indexerRoot compileDBDir indexerOutput
     writeToDB backend repo indexerData
 
     -- deriving
@@ -96,80 +110,182 @@ indexerWith deriveToo = Indexer {
       derive clangVerbose clangDeriveBin backend repo
   }
 
-  where generateInventory backend repo outFile =
-          serializeInventory backend repo >>= BS.writeFile outFile
+  where
+  generateInventory backend repo outFile =
+    serializeInventory backend repo >>= BS.writeFile outFile
+
+  cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin ->
+    spawnAndConcurrentLog verbose cmakeBin
+      [ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1"
+      , "-S", srcDir
+      , "-B", tmpDir
+      ]
 
-        cmake verbose srcDir tmpDir = withExe "cmake" Nothing $ \cmakeBin ->
-          spawnAndConcurrentLog verbose cmakeBin
-            [ "-DCMAKE_EXPORT_COMPILE_COMMANDS=1"
-            , "-S", srcDir
-            , "-B", tmpDir
+  index Clang{..} inventory srcDir buildDir tmpDir =
+    withExe "clang-index" clangIndexBin $ \clangIndex -> do
+      let args =
+            [ "-cdb_dir", buildDir
+            , "-cdb_target", "all"
+            , "-root", srcDir
+            , "--inventory", inventory
+            , "-logtostderr"
             ]
 
-        index verbose indexBin inventory srcDir buildDir outFile =
-          withExe "clang-index" indexBin $ \clangIndex -> do
-            let args = [ "-cdb_dir", buildDir
-                       , "-cdb_target", "all"
-                       , "-root", srcDir
-                       , "-dump", outFile
-                       , "--inventory", inventory
-                       , "-logtostderr"
-                       ]
-            spawnAndConcurrentLog verbose clangIndex args
-
-        writeToDB backend repo dataFile = do
-          dat <- BS.readFile dataFile
-          case deserializeGen (Proxy :: Proxy Compact) dat of
-            Left parseError -> error parseError
-            Right batch -> sendBatch backend repo batch
-
-        derive verbose deriveBin backend repo =
-          withExe "clang-derive" deriveBin $ \clangDerive -> do
-            let go service = spawnAndConcurrentLog verbose clangDerive
-                  [ "--repo", showRepo repo
-                  , "--service", service
-                  ]
-            case backendKind backend of
-              BackendEnv env -> do
-                fb303 <- newFb303 "gleandriver"
-                let state = GleanHandler.State fb303 env
-                withBackgroundFacebookService
-                  (GleanHandler.fb303State state)
-                  (GleanHandler.handler state)
-                  CppServer.defaultOptions
-                  $ \server ->
-                  go ("localhost:" <> show (CppServer.serverPort server))
-              BackendThrift thrift -> do
-                let clientConfig = thriftBackendClientConfig thrift
-                go $ serviceToString (clientConfig_serv clientConfig)
+      -- get the total number of source files
+      sources <- do
+        let pargs = args ++ ["--print_sources_count"]
+        s <- readProcess clangIndex pargs ""
+        case reads s of
+          [(sources,"")] -> return sources
+          _ -> error $ unwords (clangIndex:pargs)
+            ++ " produced unexpected output \"" ++ s ++ "\""
+
+      case sources of
+        0 -> do
+          -- TODO: should this be an error?
+          putStrLn "No source files to index"
+          return []
+        _ ->
+          -- set up worklist
+          let ranges =
+                map (\(i,n) -> Worklist.Range i (i+n)) $ chunk clangJobs sources
+              !workers = length ranges
+          in
+          Worklist.withTemp ranges $ \wfile worklist ->
+
+          -- progress and logging (the worklist has one slot per worker)
+          (if clangProgress || not clangVerbose
+            then withProgress worklist workers sources
+            else id) $
+          withLog clangVerbose (void . evaluate . length) $ \stream -> do
+
+          -- run workers
+          let dataFile i = tmpDir </> "indexer-" <> show i <> ".data"
+              workerargs i = args ++
+                [ "-dump", dataFile i
+                , "--work_file", wfile
+                , "--worker_index", show i
+                , "--worker_count", show workers
+                ]
+          forConcurrently_ [0 .. workers-1] $ \i -> bracket
+            -- createProcess_ because we don't want the stdout/stderr handles
+            -- to be closed
+            (createProcess_
+              "Cpp.index"
+              (proc clangIndex $ workerargs i)
+                {std_out = stream, std_err = stream})
+            cleanupProcess
+            $ \(_, _, _, ph) -> do
+              ex <- waitForProcess ph
+              case ex of
+                ExitSuccess -> return ()
+                ExitFailure code -> error $ unwords (clangIndex:workerargs i)
+                  ++ " returned exit code " ++ show code
+
+          -- return data file names
+          return $ map dataFile [0 .. workers-1]
+
+  writeToDB backend repo = mapM_ $ \dataFile -> do
+    dat <- BS.readFile dataFile
+    case deserializeGen (Proxy :: Proxy Compact) dat of
+      Left parseError -> error parseError
+      Right batch -> sendBatch backend repo batch
+
+  derive verbose deriveBin backend repo =
+    withExe "clang-derive" deriveBin $ \clangDerive -> do
+      let go service = spawnAndConcurrentLog verbose clangDerive
+            [ "--repo", showRepo repo
+            , "--service", service
+            ]
+      case backendKind backend of
+        BackendEnv env -> do
+          fb303 <- newFb303 "gleandriver"
+          let state = GleanHandler.State fb303 env
+          withBackgroundFacebookService
+            (GleanHandler.fb303State state)
+            (GleanHandler.handler state)
+            CppServer.defaultOptions
+            $ \server ->
+            go ("localhost:" <> show (CppServer.serverPort server))
+        BackendThrift thrift -> do
+          let clientConfig = thriftBackendClientConfig thrift
+          go $ serviceToString (clientConfig_serv clientConfig)
+
+-- | Run an action while a background thread reports indexing progress on
+-- stdout once a second. Progress is derived from the worklist ranges of the
+-- first @jobs@ workers, out of @total@ source files; an ANSI terminal gets an
+-- in-place counter, anything else gets 10% steps.
+withProgress :: Worklist.Worklist -> Int -> Int -> IO a -> IO a
+withProgress worklist jobs total action = do
+  terminal <- ANSI.hSupportsANSI stdout
+  flush $ start terminal
+  x <- withAsync (showProgress terminal 0) $ const action
+  flush $ finish terminal
+  return x
+  where
+    flush f = f >> hFlush stdout
+
+    tmsg n =
+      unwords ["Indexed", show (total - n), "of", show total, "source files"]
+
+    start True = putStrLn $ tmsg total
+    start False = putStr $ unwords ["Indexing", show total, "source files: 0%"]
+
+    finish True = do
+      ANSI.cursorUpLine 1
+      putStrLn $ tmsg 0
+    finish False =
+      putStrLn $ " ... 100%\nIndexed " ++ show total ++ " source files"
+
+    message True before now = when (before /= now) $ do
+      ANSI.cursorUpLine 1
+      putStrLn $ tmsg now
+    message False before now = do
+      let tenth n = ((total - n) * 10) `div` total
+          t = tenth now
+      when (t /= 0 && t /= 10 && t /= tenth before) $
+        putStr $ " ... " <> show (tenth now) <> "0%"
+
+    showProgress terminal before = do
+      ranges <- mapM (Worklist.get worklist) [0 .. jobs-1]
+      let remaining = max 0 $ sum $ map (\(Worklist.Range i k) -> k-i) ranges
+      flush $ message terminal before remaining
+      threadDelay 1000000
+      showProgress terminal remaining
+
+-- | Run an action with the 'StdStream' its subprocesses should write to. In
+-- verbose mode that is the inherited stdout/stderr; otherwise it is a pipe
+-- whose contents are fed lazily to @log@ on a background thread for as long
+-- as the action runs.
+withLog :: Bool -> (String -> IO ()) -> (StdStream -> IO a) -> IO a
+withLog verbose log act
+  | verbose = act Inherit
+  | otherwise = bracket createPipe (\(r,w) -> hClose r >> hClose w)
+      $ \(outRead, outWrite) ->
+        withAsync (log =<< hGetContents outRead)
+          $ const $ act $ UseHandle outWrite
 
 -- | Simple concurrent logger. Spawn the process and asynchronously log
 -- concise or full contents to stdout. Should use a fancy progress bar really
 spawnAndConcurrentLog :: Bool -> FilePath -> [String] -> IO ()
-spawnAndConcurrentLog verbose exe args = do
-  (_, Just hout, Just herr, ph) <- createProcess (proc exe args)
-    { std_out = CreatePipe, std_err = CreatePipe }
-  ex <- withAsync (log hout) $ \asyncOut ->
-    withAsync (log herr) $ \asyncErr -> do
-      status <- waitForProcess ph
-      cancel asyncOut
-      cancel asyncErr
-      putStr "\n" >> hFlush stdout
-      return status
+spawnAndConcurrentLog verbose exe args = withLog verbose log $ \stream -> do
+  (_, _, _, ph) <- createProcess (proc exe args)
+    { std_out = stream, std_err = stream }
+  ex <- waitForProcess ph
   case ex of
     ExitSuccess -> return ()
     ExitFailure i -> error $ unwords (exe:args)
       ++ " returned exit code " ++ show i
   where
-    log h = mapM_ draw . lines =<< hGetContents h
-    draw s
-      | verbose = putStrLn s
-      | otherwise = putChar '.' >> hFlush stdout
+    log s = mapM_ (const $ putChar '.' >> hFlush stdout) (lines s)
+      `finally` do
+        putStr "\n"
+        hFlush stdout
 
 --
 -- We need to find clang-index and clang-derive in $PATH or in-tree
 --
-withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO ()) -> IO ()
+withExe :: FilePath -> Maybe FilePath -> (FilePath -> IO a) -> IO a
 withExe _ (Just exePath) f = do
   exeExists <- doesFileExist exePath
   if exeExists
diff --git a/glean/lang/clang/glean-clang.cabal b/glean/lang/clang/glean-clang.cabal
index 31a213d98..8b823189d 100644
--- a/glean/lang/clang/glean-clang.cabal
+++ b/glean/lang/clang/glean-clang.cabal
@@ -38,6 +38,7 @@ flag opt
 
 common deps
     build-depends:
+        ansi-terminal ^>= 0.11,
         array ^>=0.5.2.0,
         async ^>=2.2.1,
         base >=4.11.1 && <4.15,
diff --git a/glean/lang/clang/index.cpp b/glean/lang/clang/index.cpp
index b7df388d9..81e530e29 100644
--- a/glean/lang/clang/index.cpp
+++ b/glean/lang/clang/index.cpp
@@ -91,6 +91,12 @@ DEFINE_string(clang_resource_dir, "", "PATH to Clang resource dir");
 DEFINE_string(cdb_target, "", "Target name");
 DEFINE_string(cdb_dir, "", "Directory with compile_commands.json in it");
 
+// This is a hack to support parallel indexing in the Glean CLI
+DEFINE_bool(
+    print_sources_count,
+    false,
+    "Print the number of source files and exit");
+
 static llvm::cl::OptionCategory indexerCategory("glean");
 
 // This file implements some plumbing and the main function for the
@@ -229,7 +235,7 @@ struct Config {
       // No logging when dumping to a file
       should_log = false;
       sender = fileWriter(FLAGS_dump);
-    } else {
+    } else if (!FLAGS_print_sources_count) {
       fail("missing --service or --dump");
     }
 
@@ -590,6 +596,11 @@ int main(int argc, char **argv) {
 
   Config config(argc, argv);
 
+  if (FLAGS_print_sources_count) {
+    std::cout << config.sources.size();
+    return 0;
+  }
+
   const auto work_counter = FLAGS_work_file.empty()
     ? worklist::serialCounter(0, config.sources.size())
     : worklist::stealingCounter(
diff --git a/index-llvm.sh b/index-llvm.sh
new file mode 100755
index 000000000..bda5b4058
--- /dev/null
+++ b/index-llvm.sh
@@ -0,0 +1,216 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -e
+shopt -s extglob
+shopt -s inherit_errexit
+
+fatal() {
+  echo "$@" 1>&2
+  exit 1
+}
+
+usage() {
+  cat << EOF
+Usage: $(basename "$0") FLAGS SOURCE_DIR TARGET_DIR
+
+  Index LLVM source distribution located in SOURCE_DIR into TARGET_DIR
+
+Example:
+  mkdir -p /tmp/llvm \\
+    && ( wget -O - https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.6/llvm-14.0.6.src.tar.xz \\
+         | tar -C /tmp/llvm -xJ ) \\
+    && $0 -j$(grep -c '^processor' /proc/cpuinfo) /tmp/llvm/llvm-14.0.6.src /tmp/llvm
+
+
+Available options:
+  --glean DIR       Specify Glean source directory
+  -jN | --jobs N    Use N concurrent indexers
+  --db-root DIR     Store the generated database in DIR
+                    (default TARGET_DIR/db)
+  --db DB           Use provided name for the database
+                    (default llvm/VERSION)
+  --overwrite       Overwrite the database if it already exists
+  --schema DIR      Use the schema in DIR (default is the schema in
+                    Glean's source tree)
+EOF
+}
+
+argerror() {
+  echo "$@" >&2
+  usage >&2
+  exit 1
+}
+
+if [[ "$1" == "--help" ]]
+then
+  usage
+  exit 0
+fi
+
+MAKE_ARGS=()
+EXTRA_GLEAN_ARGS=()
+EXTRA_GLEAN_INDEX_ARGS=()
+VERBOSITY=0
+GLEAN_DB_ROOT=
+GLEAN_SCHEMA=
+GLEAN_DIR=
+
+while true
+do
+  case "$1" in
+    (-j+([0-9]))
+      MAKE_ARGS+=("$1")
+      EXTRA_GLEAN_INDEX_ARGS+=("$1")
+      shift
+      ;;
+    -j|--jobs)
+      MAKE_ARGS+=("$1")
+      EXTRA_GLEAN_INDEX_ARGS+=("$1")
+      case "$2" in
+        (+([0-9]))
+          MAKE_ARGS+=("$2")
+          EXTRA_GLEAN_INDEX_ARGS+=("$2")
+          shift 2
+          ;;
+        *)
+          argerror "Invalid number of jobs"
+          ;;
+      esac
+      ;;
+    -v|--verbose)
+      VERBOSITY=1
+      EXTRA_GLEAN_INDEX_ARGS+=("-v")
+      shift
+      ;;
+    --db-root)
+      GLEAN_DB_ROOT="$2"
+      shift 2
+      ;;
+    --schema)
+      GLEAN_SCHEMA="$2"
+      shift 2
+      ;;
+    --db)
+      GLEAN_DB="$2"
+      shift 2
+      ;;
+    --overwrite)
+      OVERWRITE=yes
+      shift
+      ;;
+    --glean)
+      GLEAN_DIR="$2"
+      shift 2
+      ;;
+    -*)
+      argerror "Unsupported option $1"
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
+
+if [[ -z "$1" ]]
+then
+  argerror "No source directory specified"
+fi
+
+SOURCE_DIR="$1"
+shift
+
+if [[ -z "$1" ]]
+then
+  argerror "No destination directory specified"
+fi
+
+OUTPUT_DIR="$1"
+shift
+
+if [[ $# -ne 0 ]]
+then
+  argerror "Extra arguments"
+fi
+
+if [[ -z "${GLEAN_DIR}" ]]
+then
+  # Look for any ancestor directory with glean.cabal.in - this allows us to
+  # move the script within the source tree
+  dir=$(dirname "$0")
+  while [[ "${dir}" != "/" ]] ; do
+    if [[ -f "${dir}"/glean.cabal.in ]]
+    then
+      GLEAN_DIR="${dir}"
+      break
+    fi
+    dir=$(dirname "${dir}")
+  done
+  if [[ -z "${GLEAN_DIR}" ]] ; then
+    fatal "Couldn't locate Glean source directory, please specify --glean"
+  fi
+fi
+
+BUILD_DIR="${OUTPUT_DIR}"/build
+mkdir -p "${BUILD_DIR}"
+GLEAN_DB_ROOT="${GLEAN_DB_ROOT:-${OUTPUT_DIR}/db}"
+mkdir -p "${GLEAN_DB_ROOT}"
+
+GLEAN_SCHEMA="${GLEAN_SCHEMA:-${GLEAN_DIR}/glean/schema/source}"
+
+# FD 5 is where we redirect all output
+if [[ "${VERBOSITY}" == "1" ]] ; then
+  exec 5>&1
+else
+  exec 5> "${OUTPUT_DIR}/index-llvm.log"
+fi
+
+echo "Building glean-clang"
+make -C "${GLEAN_DIR}" MODE=opt glean-clang >&5
+
+requirebin() {
+  tmp="$(cd "${GLEAN_DIR}" && ./quick.sh MODE=opt list-bin "$1")"
+  if ! [[ -x "${tmp}" ]] ; then
+    fatal "$1 doesn't exist at ${tmp}"
+  fi
+  echo "${tmp}"
+}
+
+CLANG_INDEX=$(requirebin glean-clang:clang-index)
+CLANG_DERIVE=$(requirebin glean-clang:clang-derive)
+GLEAN=$(requirebin glean:glean)
+
+echo "Setting up ${BUILD_DIR}"
+mkdir -p "${BUILD_DIR}" >&5
+"${CMAKE:-cmake}" \
+  -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+  -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_TARGETS_TO_BUILD=X86 \
+  -S "${SOURCE_DIR}" -B "${BUILD_DIR}" >&5
+
+echo "Generating LLVM code"
+# We get all the *TableGen targets from make help which thankfully lists all
+# available targets. We do want word splitting here so disable the corresponding
+# check.
+# shellcheck disable=SC2046
+(cd "${BUILD_DIR}" && \
+  make "${MAKE_ARGS[@]}" intrinsics_gen acc_gen omp_gen llvm_vcsrevision_h \
+    $(make help | sed -n '/^[.][.][.] [A-Za-z0-9]*TableGen$/ s/^....//p')) >&5
+
+if [[ -z "${GLEAN_DB}" ]] ; then
+  GLEAN_DB=llvm/$(sed -n '/^Version/ {s/^Version: //p;q}' "${BUILD_DIR}/llvm.spec")
+fi
+
+if [[ -n "${OVERWRITE}" ]] ; then
+  rm -rf "${GLEAN_DB_ROOT:?}/${GLEAN_DB}"
+fi
+
+echo "Indexing ${GLEAN_DB} in ${GLEAN_DB_ROOT}"
+"${GLEAN}" "${EXTRA_GLEAN_ARGS[@]}" \
+  --schema "${GLEAN_SCHEMA}" --db-root "${GLEAN_DB_ROOT}" \
+  index --db "${GLEAN_DB}" cpp-cmake \
+  --indexer "${CLANG_INDEX}" --deriver "${CLANG_DERIVE}" --cdb "${BUILD_DIR}" \
+  --verbose --progress "${EXTRA_GLEAN_INDEX_ARGS[@]}" "${SOURCE_DIR}" 2>&5
diff --git a/mk/cxx.mk b/mk/cxx.mk
index 5282739af..f49132f81 100644
--- a/mk/cxx.mk
+++ b/mk/cxx.mk
@@ -85,6 +85,7 @@ CXX_LIBRARIES = $(subst CXX_SOURCES_,,$(filter CXX_SOURCES_%, $(.VARIABLES)))
 
 .PHONY: cxx-libraries
 cxx-libraries:
+	@:
 
 # Include the right settings
 #
diff --git a/quick.sh b/quick.sh
index 325d086c9..699b55a5f 100755
--- a/quick.sh
+++ b/quick.sh
@@ -16,7 +16,7 @@ MAKE_ARGS=()
 
 for arg in "$@"; do
     case $arg in
-        build|run|test)
+        build|run|test|list-bin)
             ACTION="$1"
             shift
             break
@@ -42,4 +42,10 @@ make "${MAKE_ARGS[@]}" .build/current.sh glean.cabal cxx-libraries
 
 . .build/current.sh
 
-call_cabal "${ACTION}" "${TARGET}" -- "$@"
+CABAL_ARGS=()
+# Suppress "Up to date" etc. for list-bin
+if [ "$ACTION" = "list-bin" ]; then
+    CABAL_ARGS+=(-vsilent)
+fi
+
+call_cabal "${CABAL_ARGS[@]}" "${ACTION}" "${TARGET}" -- "$@"