diff --git a/_bibliography/ASL_Bib.bib b/_bibliography/ASL_Bib.bib old mode 100755 new mode 100644 index 115ab807..07e23922 --- a/_bibliography/ASL_Bib.bib +++ b/_bibliography/ASL_Bib.bib @@ -1447,7 +1447,7 @@ @phdthesis{Starek2016 } @incollection{SpieserTreleavenEtAl2014, - author = {Spieser, K. and Treleaven, K. and Zhang, R. and Frazzoli, E. and Moгton, D. and Pavone, M.}, + author = {Spieser, K. and Treleaven, K. and Zhang, R. and Frazzoli, E. and Morton, D. and Pavone, M.}, title = {Toward a Systematic Approach to the Design and Evaluation of {Autonomous} {Mobility-on-Demand} Systems: A Case Study in {Singapore}}, booktitle = {Road Vehicle Automation}, year = {2014}, @@ -2159,6 +2159,8 @@ @Article{RossiIglesiasEtAl2018b url = {https://arxiv.org/abs/1709.04906}, } + + @inproceedings{RossiBandyopadhyayEtAl2018, author = {Rossi, F. and Bandyopadhyay, S. and Wolf, M. and Pavone, M.}, title = {Review of Multi-Agent Algorithms for Collective Behavior: a Structural Taxonomy}, @@ -2253,7 +2255,7 @@ @InProceedings{RibeiroLukeEtAl2023 doi = {10.46855/energy-proceedings-11033}, owner = {jthluke}, timestamp = {2024-10-28}, - url = {https://www.energy-proceedings.org/towards-a-24-7-carbon-free-electric-fleet%3A-a-digital-twin-framework/}, + url = {https://www.energy-proceedings.org/towards-a-24-7-carbon-free-electric-fleet%3A-a-digital-twin-framework}, } @inproceedings{ReidRovedaEtAl2014, @@ -2268,10 +2270,6 @@ @inproceedings{ReidRovedaEtAl2014 owner = {bylard}, timestamp = {2017-02-20} } - abstract = {This letter proposes a novel force-based task-orientation controller for interaction tasks with environmental orientation uncertainties. The main aim of the controller is to align the robot tool along the main task direction (e.g., along screwing, insertion, polishing, etc.) without the use of any external sensors (e.g., vision systems), relying only on end-effector wrench measurements/estimations. We propose a gradient descent-based orientation controller, enhancing its performance with the orientation predictions provided by a Gaussian Process model. Derivation of the controller is presented, together with simulation results (considering a probing task) and experimental results involving various re-orientation scenarios, i.e., i) a task with the robot in interaction with a soft environment, ii) a task with the robot in interaction with a stiff and inclined environment, and iii) a task to enable the assembly of a gear into its shaft. The proposed controller is compared against a state-of-the-art approach, highlighting its ability to re-orient the robot tool even in complex tasks (where the state-of-the-art method fails).}, - owner = {lpabon}, - timestamp = {2024-08-19} -} @article{RamirezPavoneEtAl2010, author = {Ramirez, J. L. and Pavone, M. and Frazzoli, E. and Miller, D. W.}, @@ -2747,6 +2745,7 @@ @InProceedings{NewdickChenEtAl2023 @inproceedings{NeiraBrownEtAl2024, author = {Neira, D. E. B. and Brown, R. and Sathe, P. and Wudarski, F. and Pavone, M. and Rieffel, E. G. and Venturelli, D.}, title = {Benchmarking the Operation of Quantum Heuristics and Ising Machines: Scoring Parameter Setting Strategies on Optimization Applications}, + booktitle = {}, year = {2024}, keywords = {sub}, note = {Submitted}, @@ -2938,15 +2937,16 @@ @inproceedings{LuoZhaoEtAl2023 @inproceedings{LuoSinhaEtAl2023, author = {Luo, R. and Sinha, R. and Sun, Y. and Hindy, A. and Zhao, S. and Savarese, S. and Schmerling, E. 
and Pavone, M.}, title = {Online Distribution Shift Detection via Recency Prediction}, - booktitle = {proc_IEEE_ICRA}, + booktitle = proc_IEEE_ICRA, year = {2024}, - abstract = {When deploying modern machine learning-enabled robotic systems in high-stakes applications, detecting distribution shift is critical. However, most existing methods for detecting distribution shift are not well-suited to robotics settings, where data often arrives in a streaming fashion and may be very high-dimensional. In this work, we present an online method for detecting distribution shift with guarantees on the false positive rate — i.e., when there is no distribution shift, our system is very unlikely (with probability < ε) to falsely issue an alert; any alerts that are issued should therefore be heeded. Our method is specifically designed for efficient detection even with high dimensional data, and it empirically achieves up to 11x faster detection on realistic robotics settings compared to prior work while maintaining a low false negative rate in practice (whenever there is a distribution shift in our experiments, our method indeed emits an alert). We demonstrate our approach in both simulation and hardware for a visual servoing task, and show that our method indeed issues an alert before a failure occurs.}, + abstract = {When deploying modern machine learning-enabled robotic systems in high-stakes applications, detecting distribution shift is critical. However, most existing methods for detecting distribution shift are not well-suited to robotics settings, where data often arrives in a streaming fashion and may be very high-dimensional. In this work, we present an online method for detecting distribution shift with guarantees on the false positive rate — i.e., when there is no distribution shift, our system is very unlikely (with probability $< \epsilon$) to falsely issue an alert; any alerts that are issued should therefore be heeded. Our method is specifically designed for efficient detection even with high dimensional data, and it empirically achieves up to 11x faster detection on realistic robotics settings compared to prior work while maintaining a low false negative rate in practice (whenever there is a distribution shift in our experiments, our method indeed emits an alert). We demonstrate our approach in both simulation and hardware for a visual servoing task, and show that our method indeed issues an alert before a failure occurs.}, keywords = {pub}, owner = {gammelli}, timestamp = {2024-09-19}, url = {https://ieeexplore.ieee.org/abstract/document/10611114} } + @inproceedings{LuoEtAl2022, author = {Luo, R. and Bhatnagar, A. and Wang, H. and Xiong, C. and Savarese, S. and Bai, Y. and Zhao, S. and Ermon, S. and Schmerling, E. and Pavone, M.}, title = {Local Calibration: Metrics and Recalibration}, @@ -3026,6 +3026,7 @@ @inproceedings{LorenzettiMcClellanEtAl2020 timestamp = {2019-12-02} } + @article{LorenzettiMcClellanEtAl2022, author = {Lorenzetti, J. and McClellan, A. and Farhat, C. and Pavone, M.}, title = {Linear Reduced-Order Model Predictive Control}, @@ -3067,6 +3068,7 @@ @inproceedings{LorenzettiChenEtAl2018 timestamp = {2019-09-25} } + @phdthesis{Lorenzetti2021, author = {Lorenzetti, J.}, title = {Reduced Order Model Predictive Control of High-Dimensional Systems}, @@ -3419,6 +3421,7 @@ @phdthesis{Landry2021 timestamp = {2021-12-06} } + @inproceedings{LacottePilanciEtAl2019, author = {Lacotte, J. and Pilanci, M. 
and Pavone, M.}, title = {High-Dimensional Optimization in Adaptive Random Subspaces}, @@ -4466,7 +4469,7 @@ @inproceedings{DiCuevasQuiñonesEtAl2024 title = {Martian Exploration of Lava Tubes (MELT) with ReachBot: Scientific Investigation and Concept of Operations}, booktitle = proc_ICSR, year = {2024}, - month = june, + month = jun, abstract = {As natural access points to the subsurface, lava tubes and other caves have become premier targets of planetary missions for astrobiological analyses. Few existing robotic paradigms, however, are able to explore such challenging environments. ReachBot is a robot that enables navigation in planetary caves by using extendable and retractable limbs to locomote. In this paper, we outline the potential science return and mission operations for a notional mission that deploys ReachBot to a martian lava tube. We describe the motivating science goals and provide a science traceability matrix to guide payload selection. We also develop a Concept of Operations (ConOps) for ReachBot, providing a framework for deployment and activities on Mars, analyzing mission risks, and developing mitigation strategies.}, owner = {amine}, url = {https://arxiv.org/abs/2406.13857}, @@ -4764,7 +4767,7 @@ @InProceedings{ChinchaliPergamentEtAl2020 booktitle = proc_ISER, year = {2020}, address = {Valetta, Malta}, - month = {March}, + month = mar, abstract = {Today's robotic fleets are increasingly measuring high-volume video and LIDAR sensory streams, which can be mined for valuable training data, such as rare scenes of road construction sites, to steadily improve robotic perception models. However, re-training perception models on growing volumes of rich sensory data in central compute servers (or the "cloud") places an enormous time and cost burden on network transfer, cloud storage, human annotation, and cloud computing resources. Hence, we introduce HarvestNet, an intelligent sampling algorithm that resides on-board a robot and reduces system bottlenecks by only storing rare, useful events to steadily improve perception models re-trained in the cloud. HarvestNet significantly improves the accuracy of machine-learning models on our novel dataset of road construction sites, field testing of self-driving cars, and streaming face recognition, while reducing cloud storage, dataset annotation time, and cloud compute time by between 65.7-81.3\%. Further, it is between 1.05-2.58x more accurate than baseline algorithms and scalably runs on embedded deep learning hardware.}, owner = {csandeep}, timestamp = {2020-11-09}, @@ -5078,7 +5081,7 @@ @unpublished{BrownBernalEtAl2022 @inproceedings{BrownEtAlCPAIOR2024, author = {Brown, R. A. and Venturelli, D. and Pavone, M. and Bernal Neira, D. E.}, title = {Accelerating Continuous Variable Coherent Ising Machines via Momentum}, - booktitle = {proc_CPAIOR}, + booktitle = proc_CPAIOR, year = {2024}, abstract = {The Coherent Ising Machine (CIM) is a non-conventional architecture that takes inspiration from physical annealing processes to solve Ising problems heuristically. Its dynamics are naturally continuous and described by a set of ordinary differential equations that have been proven to be useful for the optimization of continuous variables non-convex quadratic optimization problems. 
The dynamics of such Continuous Variable CIMs (CV-CIM) encourage optimization via optical pulses whose amplitudes are determined by the negative gradient of the objective; however, standard gradient descent is known to be trapped by local minima and hampered by poor problem conditioning. In this work, we propose to modify the CV-CIM dynamics using more sophisticated pulse injections based on tried-and-true optimization techniques such as momentum and Adam. Through numerical experiments, we show that the momentum and Adam updates can significantly speed up the CV-CIM’s convergence and improve sample diversity over the original CV-CIM dynamics. We also find that the Adam-CV-CIM’s performance is more stable as a function of feedback strength, especially on poorly conditioned instances, resulting in an algorithm that is more robust, reliable, and easily tunable. More broadly, we identify the CIM dynamical framework as a fertile opportunity for exploring the intersection of classical optimization and modern analog computing.}, keywords = {pub}, @@ -5224,19 +5227,6 @@ @inproceedings{BigazziEtAl2024 url = {https://arxiv.org/abs/2403.07076} } -@inproceedings{BazziShahidEtAl2024, - author = {Bazzi, M. and Shahid, A. and Agia, C. and Alora, J. and Forgione, M. and Piga, D. and Braghin, F. and Pavone, M. and Roveda, L.}, - title = {RoboMorph: In-Context Meta-Learning for Robot Dynamics Modeling}, - booktitle = proc_IFAC_ICINCO, - year = {2024}, - month = aug, - abstract = {The landscape of Deep Learning has experienced a major shift with the pervasive adoption of Transformer-based architectures, particularly in Natural Language Processing (NLP). Novel avenues for physical applications, such as solving Partial Differential Equations and Image Vision, have been explored. However, in challenging domains like robotics, where high non-linearity poses significant challenges, Transformer-based applications are scarce. While Transformers have been used to provide robots with knowledge about high-level tasks, few efforts have been made to perform system identification. This paper proposes a novel methodology to learn a meta-dynamical model of a high-dimensional physical system, such as the Franka robotic arm, using a Transformer-based architecture without prior knowledge of the system's physical parameters. The objective is to predict quantities of interest (end-effector pose and joint positions) given the torque signals for each joint. This prediction can be useful as a component for Deep Model Predictive Control frameworks in robotics. The meta-model establishes the correlation between torques and positions and predicts the output for the complete trajectory. This work provides empirical evidence of the efficacy of the in-context learning paradigm, suggesting future improvements in learning the dynamics of robotic systems without explicit knowledge of physical parameters. Code, videos, and supplementary materials can be found at project website. See this https://sites.google.com/view/robomorph.}, - address = {Porto, Portugal}, - owner = {agia}, - timestamp = {2024-10-30}, - url = {https://arxiv.org/abs/2409.11815} -} - @inproceedings{BerriaudElokdaEtAl2024, author = {Berriaud, D. and Elokda, E. and Jalota, D. and Frazzoli, E. and Pavone, M. 
and Dorfler, F.}, title = {To Spend or to Gain: Online Learning in Repeated Karma Auctions}, @@ -5244,7 +5234,7 @@ @inproceedings{BerriaudElokdaEtAl2024 year = {2024}, abstract = {Recent years have seen a surge of artificial currency-based mechanisms in contexts where monetary instruments are deemed unfair or inappropriate, e.g., for traffic congestion management or allocation of food donations. Yet the applicability of these mechanisms remains limited, since it is challenging for users to learn how to bid an artificial currency that has no value outside the mechanism. Indeed, users must learn the value of the currency as well as how to optimally spend it in a coupled manner. In this paper, we study learning to bid in two prominent classes of artificial currency auctions: those in which currency is issued at the beginning of a finite period only to be spent over the period; and those where in addition to the initial endowment currency is transferred among users by redistributing payments in each time step. In the latter class the currency has been referred to as karma, since users do not only spend karma to acquire public resources but also gain karma for yielding them. In both classes, we propose a simple learning strategy, called adaptive karma pacing strategy, and show that a) it is asymptotically optimal for a single agent bidding against a stationary competition; b) it leads to convergent learning dynamics when all agents adopt it; and c) it constitutes an approximate Nash equilibrium as the number of agents grows. This requires a novel analysis in comparison to adaptive pacing strategies in monetary auctions, since we depart from the classical assumption that the currency has known value outside the auctions. The analysis is further complicated by the possibility to both spend and gain currency in auctions with redistribution.}, address = {Edinburgh, United Kingdom}, - month = july, + month = jul, keywords = {sub}, owner = {devanshjalota}, timestamp = {2024-03-01}, @@ -5485,31 +5475,30 @@ @phdthesis{Allen2016 } @inproceedings{AgiaVilaEtAl2024, - author = {Agia, C. and Vila, {G. C.} and Bandyopadhyay, S. and Bayard, {D. S.} and Cheung, K. and Lee, {C. H.} and Wood, E. and Aenishanslin, I. and Ardito, S. and Fesq, L. and Pavone, M. and Nesnas, {I. A. D.}}, - title = {Modeling Considerations for Developing Deep Space Autonomous Spacecraft and Simulators}, - booktitle = proc_IEEE_AC, - year = {2024}, - abstract = {To extend the limited scope of autonomy used in prior missions for operation in distant and complex environments, there is a need to further develop and mature autonomy that jointly reasons over multiple subsystems, which we term system-level autonomy. System-level autonomy establishes situational awareness that resolves conflicting information across subsystems, which may necessitate the refinement and interconnection of the underlying spacecraft and environment onboard models. However, with a limited understanding of the assumptions and tradeoffs of modeling to arbitrary extents, designing onboard models to support system-level capabilities presents a significant challenge. In this paper, we provide a detailed analysis of the increasing levels of model fidelity for several key spacecraft subsystems, with the goal of informing future spacecraft functional- and system-level autonomy algorithms and the physics-based simulators on which they are validated. 
We do not argue for the adoption of a particular fidelity class of models but, instead, highlight the potential tradeoffs and opportunities associated with the use of models for onboard autonomy and in physics-based simulators at various fidelity levels. We ground our analysis in the context of deep space exploration of small bodies, an emerging frontier for autonomous spacecraft operation in space, where the choice of models employed onboard the spacecraft may determine mission success. We conduct our experiments in the Multi-Spacecraft Concept and Autonomy Tool (MuSCAT), a software suite for developing spacecraft autonomy algorithms.}, - address = {Big Sky, Montana}, - month = mar, - url = {https://arxiv.org/abs/2401.11371}, - owner = {agia}, - timestamp = {2024-10-30} + author = {Agia, C. and Vila, {G. C.} and Bandyopadhyay, S. and Bayard, {D. S.} and Cheung, K. and Lee, {C. H.} and Wood, E. and Aenishanslin, I. and Ardito, S. and Fesq, L. and Pavone, M. and Nesnas, {I. A. D.}}, + title = {Modeling Considerations for Developing Deep Space Autonomous Spacecraft and Simulators}, + booktitle = proc_IEEE_AC, + year = {2024}, + asl_abstract = {To extend the limited scope of autonomy used in prior missions for operation in distant and complex environments, there is a need to further develop and mature autonomy that jointly reasons over multiple subsystems, which we term system-level autonomy. System-level autonomy establishes situational awareness that resolves conflicting information across subsystems, which may necessitate the refinement and interconnection of the underlying spacecraft and environment onboard models. However, with a limited understanding of the assumptions and tradeoffs of modeling to arbitrary extents, designing onboard models to support system-level capabilities presents a significant challenge. In this paper, we provide a detailed analysis of the increasing levels of model fidelity for several key spacecraft subsystems, with the goal of informing future spacecraft functional- and system-level autonomy algorithms and the physics-based simulators on which they are validated. We do not argue for the adoption of a particular fidelity class of models but, instead, highlight the potential tradeoffs and opportunities associated with the use of models for onboard autonomy and in physics-based simulators at various fidelity levels. We ground our analysis in the context of deep space exploration of small bodies, an emerging frontier for autonomous spacecraft operation in space, where the choice of models employed onboard the spacecraft may determine mission success. We conduct our experiments in the Multi-Spacecraft Concept and Autonomy Tool (MuSCAT), a software suite for developing spacecraft autonomy algorithms.}, + asl_address = {Big Sky, Montana}, + asl_month = mar, + asl_url = {https://arxiv.org/abs/2401.11371}, + owner = {agia}, + timestamp = {2024-03-01} } -@inproceedings{AgiaSinhaEtAl2024, +@Article{AgiaSinhaEtAl2024, author = {Agia, C. and Sinha, R. and Yang, J. and Cao, Z. and Antonova, R. and Pavone, M. and Bohg, J.}, title = {Unpacking Failure Modes of Generative Policies: Runtime Monitoring of Consistency and Progress}, - booktitle = proc_CoRL, year = {2024}, month = nov, abstract = {Robot behavior policies trained via imitation learning are prone to failure under conditions that deviate from their training data. Thus, algorithms that monitor learned policies at test time and provide early warnings of failure are necessary to facilitate scalable deployment.
We propose Sentinel, a runtime monitoring framework that splits the detection of failures into two complementary categories: 1) Erratic failures, which we detect using statistical measures of temporal action consistency, and 2) task progression failures, where we use Vision Language Models (VLMs) to detect when the policy confidently and consistently takes actions that do not solve the task. Our approach has two key strengths. First, because learned policies exhibit diverse failure modes, combining complementary detectors leads to significantly higher accuracy at failure detection. Second, using a statistical temporal action consistency measure ensures that we quickly detect when multimodal, generative policies exhibit erratic behavior at negligible computational cost. In contrast, we only use VLMs to detect failure modes that are less time-sensitive. We demonstrate our approach in the context of diffusion policies trained on robotic mobile manipulation domains in both simulation and the real world. By unifying temporal consistency detection and VLM runtime monitoring, Sentinel detects 18\% more failures than using either of the two detectors alone and significantly outperforms baselines, thus highlighting the importance of assigning specialized detectors to complementary categories of failure. Qualitative results are made available at sites.google.com/stanford.edu/sentinel.}, address = {Munich, Germany}, + booktitle = proc_CoRL, keywords = {press}, - note = {In press}, - owner = {agia}, - timestamp = {2024-10-30}, - url = {https://arxiv.org/abs/2410.04640} + owner = {jthluke}, + timestamp = {2024-10-28}, + url = {https://arxiv.org/abs/2410.04640}, } @inproceedings{AbtahiLandryEtAl2019, @@ -5528,3 +5517,4 @@ @inproceedings{AbtahiLandryEtAl2019 @Comment{jabref-meta: databaseType:bibtex;} @Comment{jabref-meta: saveOrderConfig:specified;citationkey;false;author;true;title;true;} + diff --git a/_bibliography/AVG_papers.bib b/_bibliography/AVG_papers.bib index 6b2abbc1..f02de1f6 100644 --- a/_bibliography/AVG_papers.bib +++ b/_bibliography/AVG_papers.bib @@ -733,7 +733,7 @@ @InProceedings{YangPavone2023b year = {2023}, address = {Vancouver, Canada}, month = jun, - abstract = {The two-stage object pose estimation paradigm first detects semantic keypoints on the image and then estimates the 6D pose by minimizing reprojection errors. Despite performing well on standard benchmarks, existing techniques offer no provable guarantees on the quality and uncertainty of the estimation. In this paper, we inject two fundamental changes, namely conformal keypoint detection and geometric uncertainty propagation, into the two-stage paradigm and propose the first pose estimator that endows an estimation with provable and computable worst-case error bounds. On one hand, conformal keypoint detection applies the statistical machinery of inductive conformal prediction to convert heuristic keypoint detections into circular or elliptical prediction sets that cover the groundtruth keypoints with a user-specified marginal probability (e.g., 90%). Geometric uncertainty propagation, on the other, propagates the geometric constraints on the keypoints to the 6D object pose, leading to a Pose UnceRtainty SEt (PURSE) that guarantees coverage of the groundtruth pose with the same probability. The PURSE, however, is a nonconvex set that does not directly lead to estimated poses and uncertainties. 
Therefore, we develop RANdom SAmple averaGing (RANSAG) to compute an average pose and apply semidefinite relaxation to upper bound the worst-case errors between the average pose and the groundtruth. On the LineMOD Occlusion dataset we demonstrate: (i) the PURSE covers the groundtruth with valid probabilities; (ii) the worst-case error bounds provide correct uncertainty quantification; and (iii) the average pose achieves better or similar accuracy as representative methods based on sparse keypoints.}, + abstract = {The two-stage object pose estimation paradigm first detects semantic keypoints on the image and then estimates the 6D pose by minimizing reprojection errors. Despite performing well on standard benchmarks, existing techniques offer no provable guarantees on the quality and uncertainty of the estimation. In this paper, we inject two fundamental changes, namely conformal keypoint detection and geometric uncertainty propagation, into the two-stage paradigm and propose the first pose estimator that endows an estimation with provable and computable worst-case error bounds. On one hand, conformal keypoint detection applies the statistical machinery of inductive conformal prediction to convert heuristic keypoint detections into circular or elliptical prediction sets that cover the groundtruth keypoints with a user-specified marginal probability (e.g., 90\%). Geometric uncertainty propagation, on the other, propagates the geometric constraints on the keypoints to the 6D object pose, leading to a Pose UnceRtainty SEt (PURSE) that guarantees coverage of the groundtruth pose with the same probability. The PURSE, however, is a nonconvex set that does not directly lead to estimated poses and uncertainties. Therefore, we develop RANdom SAmple averaGing (RANSAG) to compute an average pose and apply semidefinite relaxation to upper bound the worst-case errors between the average pose and the groundtruth. On the LineMOD Occlusion dataset we demonstrate: (i) the PURSE covers the groundtruth with valid probabilities; (ii) the worst-case error bounds provide correct uncertainty quantification; and (iii) the average pose achieves better or similar accuracy as representative methods based on sparse keypoints.}, doi = {10.1109/CVPR52729.2023.00864}, owner = {jthluke}, timestamp = {2024-09-20}, @@ -772,7 +772,7 @@ @inproceedings{GuSongEtAl2024 title = {Producing and Leveraging Online Map Uncertainty in Trajectory Prediction}, booktitle = proc_IEEE_CVPR, year = {2024}, - abstract = {High-definition (HD) maps have played an integral role in the development of modern autonomous vehicle (AV) stacks, albeit with high associated labeling and maintenance costs. As a result, many recent works have proposed methods for estimating HD maps online from sensor data, enabling AVs to operate outside of previously-mapped regions. However, current online map estimation approaches are developed in isolation of their downstream tasks, complicating their integration in AV stacks. In particular, they do not produce uncertainty or confidence estimates. In this work, we extend multiple state-of-the-art online map estimation methods to additionally estimate uncertainty and show how this enables more tightly integrating online mapping with trajectory forecasting. 
In doing so, we find that incorporating uncertainty yields up to 50% faster training convergence and up to 15% better prediction performance on the real-world nuScenes driving dataset.}, + abstract = {High-definition (HD) maps have played an integral role in the development of modern autonomous vehicle (AV) stacks, albeit with high associated labeling and maintenance costs. As a result, many recent works have proposed methods for estimating HD maps online from sensor data, enabling AVs to operate outside of previously-mapped regions. However, current online map estimation approaches are developed in isolation of their downstream tasks, complicating their integration in AV stacks. In particular, they do not produce uncertainty or confidence estimates. In this work, we extend multiple state-of-the-art online map estimation methods to additionally estimate uncertainty and show how this enables more tightly integrating online mapping with trajectory forecasting. In doing so, we find that incorporating uncertainty yields up to 50\% faster training convergence and up to 15\% better prediction performance on the real-world nuScenes driving dataset.}, keywords = {pub}, owner = {devanshjalota}, timestamp = {2024-09-18}, @@ -835,7 +835,7 @@ @inproceedings{SinghWangEtAl2024 booktitle = proc_ICML, owner = {devanshjalota}, timestamp = {2024-09-18}, - abstract = {While modern best practices advocate for scalable architectures that support long-range interactions, object-centric models are yet to fully embrace these architectures. In particular, existing object-centric models for handling sequential inputs, due to their reliance on RNN-based implementation, show poor stability and capacity and are slow to train on long sequences. We introduce Parallelizable Spatiotemporal Binder or PSB, the first temporally-parallelizable slot learning architecture for sequential inputs. Unlike conventional RNN-based approaches, PSB produces object-centric representations, known as slots, for all time-steps in parallel. This is achieved by refining the initial slots across all time-steps through a fixed number of layers equipped with causal attention. By capitalizing on the parallelism induced by our architecture, the proposed model exhibits a significant boost in efficiency. In experiments, we test PSB extensively as an encoder within an auto-encoding framework paired with a wide variety of decoder options. Compared to the state-of-the-art, our architecture demonstrates stable training on longer sequences, achieves parallelization that results in a 60% increase in training speed, and yields performance that is on par with or better on unsupervised 2D and 3D object-centric scene decomposition and understanding.}, + abstract = {While modern best practices advocate for scalable architectures that support long-range interactions, object-centric models are yet to fully embrace these architectures. In particular, existing object-centric models for handling sequential inputs, due to their reliance on RNN-based implementation, show poor stability and capacity and are slow to train on long sequences. We introduce Parallelizable Spatiotemporal Binder or PSB, the first temporally-parallelizable slot learning architecture for sequential inputs. Unlike conventional RNN-based approaches, PSB produces object-centric representations, known as slots, for all time-steps in parallel. This is achieved by refining the initial slots across all time-steps through a fixed number of layers equipped with causal attention. 
By capitalizing on the parallelism induced by our architecture, the proposed model exhibits a significant boost in efficiency. In experiments, we test PSB extensively as an encoder within an auto-encoding framework paired with a wide variety of decoder options. Compared to the state-of-the-art, our architecture demonstrates stable training on longer sequences, achieves parallelization that results in a 60\% increase in training speed, and yields performance that is on par with or better on unsupervised 2D and 3D object-centric scene decomposition and understanding.}, url = {https://openreview.net/pdf?id=KpeGdDzucX}, address = {Vienna, Austria}, month = jul @@ -924,11 +924,11 @@ @InProceedings{ZhongRempeEtAl2023b editor = {Tan, Jie and Toussaint, Marc and Darvish, Kourosh}, volume = {229}, series = {Proceedings of Machine Learning Research}, - month = {06--09 Nov}, + month = nov, publisher = {PMLR}, pdf = {https://proceedings.mlr.press/v229/zhong23a/zhong23a.pdf}, url = {https://proceedings.mlr.press/v229/zhong23a.html}, - abstract = {Realistic and controllable traffic simulation is a core capability that is necessary to accelerate autonomous vehicle (AV) development. However, current approaches for controlling learning-based traffic models require significant domain expertise and are difficult for practitioners to use. To remedy this, we present CTG++, a scene-level conditional diffusion model that can be guided by language instructions. Developing this requires tackling two challenges: the need for a realistic and controllable traffic model backbone, and an effective method to interface with a traffic model using language. To address these challenges, we first propose a scene-level diffusion model equipped with a spatio-temporal transformer backbone, which generates realistic and controllable traffic. We then harness a large language model (LLM) to convert a user’s query into a loss function, guiding the diffusion model towards query-compliant generation. Through comprehensive evaluation, we demonstrate the effectiveness of our proposed method in generating realistic, query-compliant traffic simulations.} + abstract = {Realistic and controllable traffic simulation is a core capability that is necessary to accelerate autonomous vehicle (AV) development. However, current approaches for controlling learning-based traffic models require significant domain expertise and are difficult for practitioners to use. To remedy this, we present CTG++, a scene-level conditional diffusion model that can be guided by language instructions. Developing this requires tackling two challenges: the need for a realistic and controllable traffic model backbone, and an effective method to interface with a traffic model using language. To address these challenges, we first propose a scene-level diffusion model equipped with a spatio-temporal transformer backbone, which generates realistic and controllable traffic. We then harness a large language model (LLM) to convert a user’s query into a loss function, guiding the diffusion model towards query-compliant generation. Through comprehensive evaluation, we demonstrate the effectiveness of our proposed method in generating realistic, query-compliant traffic simulations.}, owner={jjalora}, } @@ -973,12 +973,12 @@ @InProceedings{TopanEtAl2023 volume={}, number={}, pages={1-8}, - abstract={A critical task for developing safe autonomous driving stacks is to determine whether an obstacle is safety-critical, i.e., poses an imminent threat to the autonomous vehicle. 
Our previous work showed that Hamilton Jacobi reachability theory can be applied to compute interaction-dynamics-aware perception safety zones that better inform an ego vehicle’s perception module which obstacles are considered safety-critical. For completeness, these zones are typically larger than absolutely necessary, forcing the perception module to pay attention to a larger collection of objects for the sake of conservatism. As an improvement, we propose a maneuver-based decomposition of our safety zones that leverages information about the ego maneuver to reduce the zone volume. In particular, we propose a "temporal convolution" operation that produces safety zones for specific ego maneuvers, thus limiting the ego’s behavior to reduce the size of the safety zones. We show with numerical experiments that maneuver-based zones are significantly smaller (up to 76% size reduction) than the baseline while maintaining completeness.}, + abstract={A critical task for developing safe autonomous driving stacks is to determine whether an obstacle is safety-critical, i.e., poses an imminent threat to the autonomous vehicle. Our previous work showed that Hamilton Jacobi reachability theory can be applied to compute interaction-dynamics-aware perception safety zones that better inform an ego vehicle’s perception module which obstacles are considered safety-critical. For completeness, these zones are typically larger than absolutely necessary, forcing the perception module to pay attention to a larger collection of objects for the sake of conservatism. As an improvement, we propose a maneuver-based decomposition of our safety zones that leverages information about the ego maneuver to reduce the zone volume. In particular, we propose a "temporal convolution" operation that produces safety zones for specific ego maneuvers, thus limiting the ego’s behavior to reduce the size of the safety zones. We show with numerical experiments that maneuver-based zones are significantly smaller (up to 76\% size reduction) than the baseline while maintaining completeness.}, doi={10.1109/IV55152.2023.10186702}, ISSN={2642-7214}, - month={June}, - owner={jjalora} - } + month=jun, + owner={jjalora}, +} @InProceedings{IvanovicHarrisonEtAl2023, @@ -1114,7 +1114,7 @@ @InProceedings{CaoXuEtAl2022b year = {2022}, address = {Auckland, New Zealand}, month = dec, - abstract = {Trajectory prediction using deep neural networks (DNNs) is an essential component of autonomous driving (AD) systems. However, these methods are vulnerable to adversarial attacks, leading to serious consequences such as collisions. In this work, we identify two key ingredients to defend trajectory prediction models against adversarial attacks including (1) designing effective adversarial training methods and (2) adding domain-specific data augmentation to mitigate the performance degradation on clean data. We demonstrate that our method is able to improve the performance by 46% on adversarial data and at the cost of only 3% performance degradation on clean data, compared to the model trained with clean data. Additionally, compared to existing robust methods, our method can improve performance by 21% on adversarial examples and 9% on clean data. Our robust model is evaluated with a planner to study its downstream impacts. 
We demonstrate that our model can significantly reduce the severe accident rates (e.g., collisions and off-road driving).}, + abstract = {Trajectory prediction using deep neural networks (DNNs) is an essential component of autonomous driving (AD) systems. However, these methods are vulnerable to adversarial attacks, leading to serious consequences such as collisions. In this work, we identify two key ingredients to defend trajectory prediction models against adversarial attacks including (1) designing effective adversarial training methods and (2) adding domain-specific data augmentation to mitigate the performance degradation on clean data. We demonstrate that our method is able to improve the performance by 46\% on adversarial data and at the cost of only 3\% performance degradation on clean data, compared to the model trained with clean data. Additionally, compared to existing robust methods, our method can improve performance by 21\% on adversarial examples and 9\% on clean data. Our robust model is evaluated with a planner to study its downstream impacts. We demonstrate that our model can significantly reduce the severe accident rates (e.g., collisions and off-road driving).}, owner = {jthluke}, timestamp = {2024-09-20}, url = {https://proceedings.mlr.press/v205/cao23a.html}, @@ -1187,7 +1187,7 @@ @inproceedings{PatrikarVeerEtAl2024 title = {RuleFuser: An Evidential Bayes Approach for Rule Injection in Imitation Learned Planners and Predictors for Robustness under Distribution Shifts}, booktitle = proc_ISRR, year = {2024}, - abstract = {Modern motion planners for autonomous driving frequently use imitation learning (IL) to draw from expert driving logs. Although IL benefits from its ability to glean nuanced and multi-modal human driving behaviors from large datasets, the resulting planners often struggle with out-of-distribution (OOD) scenarios and with traffic rule compliance. On the other hand, classical rule-based planners, by design, can generate safe traffic rule compliant behaviors while being robust to OOD scenarios, but these planners fail to capture nuances in agent-to-agent interactions and human drivers' intent. RuleFuser, an evidential framework, combines IL planners with classical rule-based planners to draw on the complementary benefits of both, thereby striking a balance between imitation and safety. Our approach, tested on the real-world nuPlan dataset, combines the IL planner's high performance in in-distribution (ID) scenarios with the rule-based planners' enhanced safety in out-of-distribution (OOD) scenarios, achieving a 38.43% average improvement on safety metrics over the IL planner without much detriment to imitation metrics in OOD scenarios.}, + abstract = {Modern motion planners for autonomous driving frequently use imitation learning (IL) to draw from expert driving logs. Although IL benefits from its ability to glean nuanced and multi-modal human driving behaviors from large datasets, the resulting planners often struggle with out-of-distribution (OOD) scenarios and with traffic rule compliance. On the other hand, classical rule-based planners, by design, can generate safe traffic rule compliant behaviors while being robust to OOD scenarios, but these planners fail to capture nuances in agent-to-agent interactions and human drivers' intent. RuleFuser, an evidential framework, combines IL planners with classical rule-based planners to draw on the complementary benefits of both, thereby striking a balance between imitation and safety. 
Our approach, tested on the real-world nuPlan dataset, combines the IL planner's high performance in in-distribution (ID) scenarios with the rule-based planners' enhanced safety in out-of-distribution (OOD) scenarios, achieving a 38.43\% average improvement on safety metrics over the IL planner without much detriment to imitation metrics in OOD scenarios.}, keywords = {press}, owner = {gammelli}, timestamp = {2024-09-19}, @@ -1199,7 +1199,7 @@ @inproceedings{LuoWengEtAl2024 title = {Augmenting lane perception and topology understanding with standard definition navigation maps}, booktitle = proc_IEEE_ICRA, year = {2024}, - abstract = {Autonomous driving has traditionally relied heavily on costly and labor-intensive High Definition (HD) maps, hindering scalability. In contrast, Standard Definition (SD) maps are more affordable and have worldwide coverage, offering a scalable alternative. In this work, we systematically explore the effect of SD maps for real-time lane-topology understanding. We propose a novel framework to integrate SD maps into online map prediction and propose a Transformer-based encoder, SD Map Encoder Representations from transFormers, to leverage priors in SD maps for the lane-topology prediction task. This enhancement consistently and significantly boosts (by up to 60%) lane detection and topology prediction on current state-of-the-art online map prediction methods without bells and whistles and can be immediately incorporated into any Transformer-based lane-topology method. Code is available at https://github.com/NVlabs/SMERF.}, + abstract = {Autonomous driving has traditionally relied heavily on costly and labor-intensive High Definition (HD) maps, hindering scalability. In contrast, Standard Definition (SD) maps are more affordable and have worldwide coverage, offering a scalable alternative. In this work, we systematically explore the effect of SD maps for real-time lane-topology understanding. We propose a novel framework to integrate SD maps into online map prediction and propose a Transformer-based encoder, SD Map Encoder Representations from transFormers, to leverage priors in SD maps for the lane-topology prediction task. This enhancement consistently and significantly boosts (by up to 60\%) lane detection and topology prediction on current state-of-the-art online map prediction methods without bells and whistles and can be immediately incorporated into any Transformer-based lane-topology method. Code is available at \url{https://github.com/NVlabs/SMERF}.}, keywords = {pub}, owner = {gammelli}, timestamp = {2024-09-19}, @@ -1367,7 +1367,7 @@ @inproceedings{LiZhuEtAl2024 title = {Wolf: Captioning Everything with a World Summarization Framework}, booktitle = proc_ICLR, year = {2025}, - abstract = {We propose Wolf, a WOrLd summarization Framework for accurate video captioning. Wolf is an automated captioning framework that adopts a mixture-of-experts approach, leveraging complementary strengths of Vision Language Models (VLMs). By utilizing both image and video models, our framework captures different levels of information and summarizes them efficiently. Our approach can be applied to enhance video understanding, auto-labeling, and captioning. To evaluate caption quality, we introduce CapScore, an LLM-based metric to assess the similarity and quality of generated captions compared to the ground truth captions. We further build four human-annotated datasets in three domains: autonomous driving, general scenes, and robotics, to facilitate comprehensive comparisons.
We show that Wolf achieves superior captioning performance compared to state-of-the-art approaches from the research community (VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise by 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally, we establish a benchmark for video captioning and introduce a leaderboard, aiming to accelerate advancements in video understanding, captioning, and data alignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.}, + abstract = {We propose Wolf, a WOrLd summarization Framework for accurate video captioning. Wolf is an automated captioning framework that adopts a mixture-of-experts approach, leveraging complementary strengths of Vision Language Models (VLMs). By utilizing both image and video models, our framework captures different levels of information and summarizes them efficiently. Our approach can be applied to enhance video understanding, auto-labeling, and captioning. To evaluate caption quality, we introduce CapScore, an LLM-based metric to assess the similarity and quality of generated captions compared to the ground truth captions. We further build four human-annotated datasets in three domains: autonomous driving, general scenes, and robotics, to facilitate comprehensive comparisons. We show that Wolf achieves superior captioning performance compared to state-of-the-art approaches from the research community (VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise by 55.6\% and similarity-wise by 77.4\% on challenging driving videos. Finally, we establish a benchmark for video captioning and introduce a leaderboard, aiming to accelerate advancements in video understanding, captioning, and data alignment. Leaderboard: \url{https://wolfv0.github.io/leaderboard.html}.}, keywords = {sub}, owner = {amine}, timestamp = {2024-09-19}, url = {https://arxiv.org/abs/2407.17453} } @inproceedings{FangZhuEtAl2024, author = {Fang, Y. and Zhu, L. and Lu, Y. and Wang, Y. and Molchanov, P. and Cho, J. H. and Pavone, M. and Han, S. and Yin, H.}, - title = {$VILA^2$: VILA Augmented VILA}, + title = {\(VILA^2\): VILA Augmented VILA}, booktitle = proc_NIPS, year = {2024}, - abstract = {Visual language models (VLMs) have rapidly progressed, driven by the success of large language models (LLMs). While model architectures and training infrastructures advance rapidly, data curation remains under-explored. When data quantity and quality become a bottleneck, existing work either directly crawls more raw data from the Internet that does not have a guarantee of data quality or distills from black-box commercial models (e.g., GPT-4V / Gemini) causing the performance upper bounded by that model. In this work, we introduce a novel approach that includes a self-augment step and a specialist-augment step to iteratively improve data quality and model performance. In the self-augment step, a VLM recaptions its own pretraining data to enhance data quality, and then retrains from scratch using this refined dataset to improve model performance. This process can iterate for several rounds. Once self-augmentation saturates, we employ several specialist VLMs finetuned from the self-augmented VLM with domain-specific expertise, to further infuse specialist knowledge into the generalist VLM through task-oriented recaptioning and retraining.
With the combined self-augmented and specialist-augmented training, we introduce $VILA^2$ (VILA-augmented-VILA), a VLM family that consistently improves the accuracy on a wide range of tasks over prior art, and achieves new state-of-the-art results on MMMU leaderboard among open-sourced models.}, + abstract = {Visual language models (VLMs) have rapidly progressed, driven by the success of large language models (LLMs). While model architectures and training infrastructures advance rapidly, data curation remains under-explored. When data quantity and quality become a bottleneck, existing work either directly crawls more raw data from the Internet that does not have a guarantee of data quality or distills from black-box commercial models (e.g., GPT-4V / Gemini) causing the performance upper bounded by that model. In this work, we introduce a novel approach that includes a self-augment step and a specialist-augment step to iteratively improve data quality and model performance. In the self-augment step, a VLM recaptions its own pretraining data to enhance data quality, and then retrains from scratch using this refined dataset to improve model performance. This process can iterate for several rounds. Once self-augmentation saturates, we employ several specialist VLMs finetuned from the self-augmented VLM with domain-specific expertise, to further infuse specialist knowledge into the generalist VLM through task-oriented recaptioning and retraining. With the combined self-augmented and specialist-augmented training, we introduce \(VILA^2\) (VILA-augmented-VILA), a VLM family that consistently improves the accuracy on a wide range of tasks over prior art, and achieves new state-of-the-art results on MMMU leaderboard among open-sourced models.}, keywords = {press}, owner = {amine}, timestamp = {2024-09-19}, url = {https://arxiv.org/abs/2407.17453} } -@inproceedings{GuSongEtAl2024, +@inproceedings{GuSongEtAl2024a, author = {Gu, X. and Song, G. and Gilitschenski, I. and Pavone, M. and Ivanovic, B.}, title = {Accelerating Online Mapping and Behavior Prediction via Direct BEV Feature Attention}, booktitle = proc_ECCV, @@ -1439,14 +1439,14 @@ @inproceedings{FanWangEtAl2024 title = {Learning Traffic Crashes as Language: Datasets, Benchmarks, and What-if Causal Analyses}, booktitle = proc_NIPS, year = {2024}, - abstract = {The increasing rate of road accidents worldwide results not only in significant loss of life but also imposes billions financial burdens on societies. Current research in traffic crash frequency modeling and analysis has predominantly approached the problem as classification tasks, focusing mainly on learning-based classification or ensemble learning methods. These approaches often overlook the intricate relationships among the complex infrastructure, environmental, human and contextual factors related to traffic crashes and risky situations. In contrast, we initially propose a large-scale traffic crash language dataset, named CrashEvent, summarizing 19,340 real-world crash reports and incorporating infrastructure data, environmental and traffic textual and visual information in Washington State. Leveraging this rich dataset, we further formulate the crash event feature learning as a novel text reasoning problem and further fine-tune various large language models (LLMs) to predict detailed accident outcomes, such as crash types, severity and number of injuries, based on contextual and environmental factors.
The proposed model, CrashLLM, distinguishes itself from existing solutions by leveraging the inherent text reasoning capabilities of LLMs to parse and learn from complex, unstructured data, thereby enabling a more nuanced analysis of contributing factors. Our experiments results shows that our LLM-based approach not only predicts the severity of accidents but also classifies different types of accidents and predicts injury outcomes, all with averaged F1 score boosted from 34.9% to 53.8%. Furthermore, CrashLLM can provide valuable insights for numerous open-world what-if situational-awareness traffic safety analyses with learned reasoning features, which existing models cannot offer. We make our benchmark, datasets, and model public available for further exploration.}, + abstract = {The increasing rate of road accidents worldwide results not only in significant loss of life but also imposes billions financial burdens on societies. Current research in traffic crash frequency modeling and analysis has predominantly approached the problem as classification tasks, focusing mainly on learning-based classification or ensemble learning methods. These approaches often overlook the intricate relationships among the complex infrastructure, environmental, human and contextual factors related to traffic crashes and risky situations. In contrast, we initially propose a large-scale traffic crash language dataset, named CrashEvent, summarizing 19,340 real-world crash reports and incorporating infrastructure data, environmental and traffic textual and visual information in Washington State. Leveraging this rich dataset, we further formulate the crash event feature learning as a novel text reasoning problem and further fine-tune various large language models (LLMs) to predict detailed accident outcomes, such as crash types, severity and number of injuries, based on contextual and environmental factors. The proposed model, CrashLLM, distinguishes itself from existing solutions by leveraging the inherent text reasoning capabilities of LLMs to parse and learn from complex, unstructured data, thereby enabling a more nuanced analysis of contributing factors. Our experiments results shows that our LLM-based approach not only predicts the severity of accidents but also classifies different types of accidents and predicts injury outcomes, all with averaged F1 score boosted from 34.9\% to 53.8\%. Furthermore, CrashLLM can provide valuable insights for numerous open-world what-if situational-awareness traffic safety analyses with learned reasoning features, which existing models cannot offer. We make our benchmark, datasets, and model public available for further exploration.}, keywords = {sub}, owner = {amine}, timestamp = {2024-09-19}, url = {https://arxiv.org/abs/2406.10789} } -@inproceedings{LiWangEtAl2024, +@inproceedings{LiWangEtAl2024a, author = {Li, Y. and Wang, Z. and Wang, Y. and Yu, Z. and Gojcic, Z. and Pavone, M. and Feng, C. and Alvarez, J. M.}, title = {Memorize What Matters: Emergent Scene Decomposition from Multitraverse}, booktitle = proc_NIPS,