diff --git a/_bibliography/AVG_papers.bib b/_bibliography/AVG_papers.bib
index 735a1343..6b2abbc1 100644
--- a/_bibliography/AVG_papers.bib
+++ b/_bibliography/AVG_papers.bib
@@ -490,7 +490,7 @@ @String{proc_ACM_IPSN_symp
 @String{proc_ACM_KDDM = {{ACM Int.\ Conf.\ on Knowledge Discovery and Data Mining}}}
 @String{proc_ACM_MobiHoc = {{ACM Int.\ Symp.\ on Mobile Ad-Hoc Networking \& Computing}}}
 @String{proc_ACM_SIGCHI = {{ACM CHI Conf.\ on Human Factors in Computing Systems}}}
-@String{proc_ACM_SIGGRAPH = {{Proc.\ of SIGGRAPH}}}
+@String{proc_ACM_SIGGRAPH_ASIA = {{Proc.\ of SIGGRAPH Asia}}}
 @String{proc_ACM_SIGPLAN = {{ACM SIGPLAN Notices}}}
 @String{proc_ACM_SIGSPATIAL = {{ACM SIGSPATIAL}}}
 @String{proc_ACM_SOCG = {{ACM Symp.\ on Computational Geometry}}}
@@ -1173,7 +1173,7 @@ @inproceedings{ChenIvanovicEtAl2022
 @inproceedings{ChenLiuEtAl2024,
   author = {Chen, X. and Liu, Z. and Luo, K. Z. and Datta, S. and Polavaram, A. and Wang, Y. and You, Y. and Li, B. and Pavone, M. and Chao, W. L. and Campbell, M. and Hariharan, B. and Weinberger, K. Q.},
   title = {DiffuBox: Refining 3D Object Detection with Point Diffusion},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {Ensuring robust 3D object detection and localization is crucial for many applications in robotics and autonomous driving. Recent models, however, face difficulties in maintaining high performance when applied to domains with differing sensor setups or geographic locations, often resulting in poor localization accuracy due to domain shift. To overcome this challenge, we introduce a novel diffusion-based box refinement approach. This method employs a domain-agnostic diffusion model, conditioned on the LiDAR points surrounding a coarse bounding box, to simultaneously refine the box's location, size, and orientation. We evaluate this approach under various domain adaptation settings, and our results reveal significant improvements across different datasets, object classes and detectors.},
   keywords = {sub},
@@ -1185,10 +1185,10 @@ @inproceedings{ChenLiuEtAl2024
 @inproceedings{PatrikarVeerEtAl2024,
   author = {Patrikar, J. and Veer, S. and Sharma, A. and Pavone, M. and Scherer, S.},
   title = {RuleFuser: An Evidential Bayes Approach for Rule Injection in Imitation Learned Planners and Predictors for Robustness under Distribution Shifts},
-  booktitle = {},
+  booktitle = proc_ISRR,
   year = {2024},
   abstract = {Modern motion planners for autonomous driving frequently use imitation learning (IL) to draw from expert driving logs. Although IL benefits from its ability to glean nuanced and multi-modal human driving behaviors from large datasets, the resulting planners often struggle with out-of-distribution (OOD) scenarios and with traffic rule compliance. On the other hand, classical rule-based planners, by design, can generate safe traffic rule compliant behaviors while being robust to OOD scenarios, but these planners fail to capture nuances in agent-to-agent interactions and human drivers' intent. RuleFuser, an evidential framework, combines IL planners with classical rule-based planners to draw on the complementary benefits of both, thereby striking a balance between imitation and safety. Our approach, tested on the real-world nuPlan dataset, combines the IL planner's high performance in in-distribution (ID) scenarios with the rule-based planners' enhanced safety in out-of-distribution (OOD) scenarios, achieving a 38.43% average improvement on safety metrics over the IL planner without much detriment to imitation metrics in OOD scenarios.},
-  keywords = {sub},
+  keywords = {press},
   owner = {gammelli},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2405.11139}
@@ -1291,7 +1291,7 @@ @inproceedings{CaoIvanovicEtAl2024
 @inproceedings{ChoIvanovicEtAl2024,
   author = {Cho, J. H. and Ivanovic, B. and Cao, Y. and Schmerling, E. and Wang, Y. and Weng, X. and Li, B. and You, Y. and Krähenbühl, P. and Wang, Y. and Pavone, M.},
   title = {Language-Image Models with 3D Understanding},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {Multi-modal large language models (MLLMs) have shown incredible capabilities in a variety of 2D vision and language tasks. We extend MLLMs' perceptual capabilities to ground and reason about images in 3-dimensional space. To that end, we first develop a large-scale pre-training dataset for 2D and 3D called LV3D by combining multiple existing 2D and 3D recognition datasets under a common task formulation: as multi-turn question-answering. Next, we introduce a new MLLM named Cube-LLM and pre-train it on LV3D. We show that pure data scaling makes a strong 3D perception capability without 3D specific architectural design or training objective. Cube-LLM exhibits intriguing properties similar to LLMs: (1) Cube-LLM can apply chain-of-thought prompting to improve 3D understanding from 2D context information. (2) Cube-LLM can follow complex and diverse instructions and adapt to versatile input and output formats. (3) Cube-LLM can be visually prompted such as 2D box or a set of candidate 3D boxes from specialists. Our experiments on outdoor benchmarks demonstrate that Cube-LLM significantly outperforms existing baselines by 21.3 points of AP-BEV on the Talk2Car dataset for 3D grounded reasoning and 17.7 points on the DriveLM dataset for complex reasoning about driving scenarios, respectively. Cube-LLM also shows competitive results in general MLLM benchmarks such as refCOCO for 2D grounding with (87.0) average score, as well as visual question answering benchmarks such as VQAv2, GQA, SQA, POPE, etc. for complex reasoning. Our project is available at https://janghyuncho.github.io/Cube-LLM.},
   keywords = {sub},
@@ -1303,7 +1303,7 @@ @inproceedings{ChoIvanovicEtAl2024
 @inproceedings{FanCongEtAl2024,
   author = {Fan, Z. and Cong, W. and Wen, K. and Wang, K. and Zhang, J. and Ding, X. and Xu, D. and Ivanovic, B. and Pavone, M. and Pavlakos, G. and Wang, Z. and Wang, Y.},
   title = {InstantSplat: Unbounded sparse-view pose-free Gaussian splatting in 40 seconds},
-  booktitle = {},
+  booktitle = proc_ACM_SIGGRAPH_ASIA,
   year = {2024},
   abstract = {While novel view synthesis (NVS) from a sparse set of images has advanced significantly in 3D computer vision, it relies on precise initial estimation of camera parameters using Structure-from-Motion (SfM). For instance, the recently developed Gaussian Splatting depends heavily on the accuracy of SfM-derived points and poses. However, SfM processes are time-consuming and often prove unreliable in sparse-view scenarios, where matched features are scarce, leading to accumulated errors and limited generalization capability across datasets. In this study, we introduce a novel and efficient framework to enhance robust NVS from sparse-view images. Our framework, InstantSplat, integrates multi-view stereo(MVS) predictions with point-based representations to construct 3D Gaussians of large-scale scenes from sparse-view data within seconds, addressing the aforementioned performance and efficiency issues by SfM. Specifically, InstantSplat generates densely populated surface points across all training views and determines the initial camera parameters using pixel-alignment. Nonetheless, the MVS points are not globally accurate, and the pixel-wise prediction from all views results in an excessive Gaussian number, yielding a overparameterized scene representation that compromises both training speed and accuracy. To address this issue, we employ a grid-based, confidence-aware Farthest Point Sampling to strategically position point primitives at representative locations in parallel. Next, we enhance pose accuracy and tune scene parameters through a gradient-based joint optimization framework from self-supervision. By employing this simplified framework, InstantSplat achieves a substantial reduction in training time, from hours to mere seconds, and demonstrates robust performance across various numbers of views in diverse datasets.},
   keywords = {sub},
@@ -1353,8 +1353,8 @@ @inproceedings{TanIvanovicEtAl2024
 @inproceedings{ChenYangEtAl2024,
   author = {Chen, Z. and Yang, J. and Huang, J. and Lutio, R. d. and Esturo, J. M. and Ivanovic, B. and Litany, O. and Gojcic, Z. and Fidler, S. and Pavone, M. and Song, L. and Wang, Y.},
   title = {OmniRe: Omni Urban Scene Reconstruction},
-  booktitle = {},
-  year = {2024},
+  booktitle = proc_ICLR,
+  year = {2025},
   abstract = {We introduce OmniRe, a holistic approach for efficiently reconstructing high-fidelity dynamic urban scenes from on-device logs. Recent methods for modeling driving sequences using neural radiance fields or Gaussian Splatting have demonstrated the potential of reconstructing challenging dynamic scenes, but often overlook pedestrians and other non-vehicle dynamic actors, hindering a complete pipeline for dynamic urban scene reconstruction. To that end, we propose a comprehensive 3DGS framework for driving scenes, named OmniRe, that allows for accurate, full-length reconstruction of diverse dynamic objects in a driving log. OmniRe builds dynamic neural scene graphs based on Gaussian representations and constructs multiple local canonical spaces that model various dynamic actors, including vehicles, pedestrians, and cyclists, among many others. This capability is unmatched by existing methods. OmniRe allows us to holistically reconstruct different objects present in the scene, subsequently enabling the simulation of reconstructed scenarios with all actors participating in real-time (~60Hz). Extensive evaluations on the Waymo dataset show that our approach outperforms prior state-of-the-art methods quantitatively and qualitatively by a large margin. We believe our work fills a critical gap in driving reconstruction.},
   keywords = {sub},
   owner = {amine},
@@ -1365,8 +1365,8 @@ @inproceedings{ChenYangEtAl2024
 @inproceedings{LiZhuEtAl2024,
   author = {Li, B. and Zhu, L. and Tian, R. and Tan, S. and Chen, Y. and Lu, Y. and Cui, Y. and Veer, S. and Ehrlich, M. and Philion, J. and Weng, X. and Xue, F. and Tao, A. and Liu, M. Y. and Fidler, S. and Ivanovic, B. and Darrell, T. and Malik, J. and Han, S. and Pavone, M.},
   title = {Wolf: Captioning Everything with a World Summarization Framework},
-  booktitle = {},
-  year = {2024},
+  booktitle = proc_ICLR,
+  year = {2025},
   abstract = {We propose Wolf, a WOrLd summarization Framework for accurate video captioning. Wolf is an automated captioning framework that adopts a mixture-of-experts approach, leveraging complementary strengths of Vision Language Models (VLMs). By utilizing both image and video models, our framework captures different levels of information and summarizes them efficiently. Our approach can be applied to enhance video understanding, auto-labeling, and captioning. To evaluate caption quality, we introduce CapScore, an LLM-based metric to assess the similarity and quality of generated captions compared to the ground truth captions. We further build four human-annotated datasets in three domains: autonomous driving, general scenes, and robotics, to facilitate comprehensive comparisons. We show that Wolf achieves superior captioning performance compared to state-of-the-art approaches from the research community (VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise by 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally, we establish a benchmark for video captioning and introduce a leaderboard, aiming to accelerate advancements in video understanding, captioning, and data alignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.},
   keywords = {sub},
   owner = {amine},
@@ -1377,10 +1377,10 @@ @inproceedings{LiZhuEtAl2024
 @inproceedings{FangZhuEtAl2024,
   author = {Fang, Y. and Zhu, L. and Lu, Y. and Wang, Y. and Molchanov, P. and Cho, J. H. and Pavone, M. and Han, S. and Yin, H.},
   title = {$VILA^2$: VILA Augmented VILA},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {Visual language models (VLMs) have rapidly progressed, driven by the success of large language models (LLMs). While model architectures and training infrastructures advance rapidly, data curation remains under-explored. When data quantity and quality become a bottleneck, existing work either directly crawls more raw data from the Internet that does not have a guarantee of data quality or distills from black-box commercial models (e.g., GPT-4V / Gemini) causing the performance upper bounded by that model. In this work, we introduce a novel approach that includes a self-augment step and a specialist-augment step to iteratively improve data quality and model performance. In the self-augment step, a VLM recaptions its own pretraining data to enhance data quality, and then retrains from scratch using this refined dataset to improve model performance. This process can iterate for several rounds. Once self-augmentation saturates, we employ several specialist VLMs finetuned from the self-augmented VLM with domain-specific expertise, to further infuse specialist knowledge into the generalist VLM through task-oriented recaptioning and retraining. With the combined self-augmented and specialist-augmented training, we introduce $VILA^2$ (VILA-augmented-VILA), a VLM family that consistently improves the accuracy on a wide range of tasks over prior art, and achieves new state-of-the-art results on MMMU leaderboard among open-sourced models.},
-  keywords = {sub},
+  keywords = {press},
   owner = {amine},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2407.17453}
@@ -1389,10 +1389,10 @@ @inproceedings{FangZhuEtAl2024
 @inproceedings{GuSongEtAl2024,
   author = {Gu, X. and Song, G. and Gilitschenski, I. and Pavone, M. and Ivanovic, B.},
   title = {Accelerating Online Mapping and Behavior Prediction via Direct BEV Feature Attention},
-  booktitle = {},
+  booktitle = proc_ECCV,
   year = {2024},
   abstract = {Understanding road geometry is a critical component of the autonomous vehicle (AV) stack. While high-definition (HD) maps can readily provide such information, they suffer from high labeling and maintenance costs. Accordingly, many recent works have proposed methods for estimating HD maps online from sensor data. The vast majority of recent approaches encode multi-camera observations into an intermediate representation, e.g., a bird's eye view (BEV) grid, and produce vector map elements via a decoder. While this architecture is performant, it decimates much of the information encoded in the intermediate representation, preventing downstream tasks (e.g., behavior prediction) from leveraging them. In this work, we propose exposing the rich internal features of online map estimation methods and show how they enable more tightly integrating online mapping with trajectory forecasting. In doing so, we find that directly accessing internal BEV features yields up to 73\% faster inference speeds and up to 29\% more accurate predictions on the real-world nuScenes dataset.},
-  keywords = {sub},
+  keywords = {pub},
   owner = {amine},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2407.06683}
@@ -1413,10 +1413,10 @@ @inproceedings{TianLiEtAl2024
 @inproceedings{DaunerHallgartenEtAl2024,
   author = {Dauner, D. and Hallgarten, M. and Li, T. and Weng, X. and Huang, Z. and Yang, Z. and Li, H. and Gilitschenski, I. and Ivanovic, B. and Pavone, M. and Geiger, A. and Chitta, K.},
   title = {NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and Benchmarking},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {Benchmarking vision-based driving policies is challenging. On one hand, open-loop evaluation with real data is easy, but these results do not reflect closed-loop performance. On the other, closed-loop evaluation is possible in simulation, but is hard to scale due to its significant computational demands. Further, the simulators available today exhibit a large domain gap to real data. This has resulted in an inability to draw clear conclusions from the rapidly growing body of research on end-to-end autonomous driving. In this paper, we present NAVSIM, a middle ground between these evaluation paradigms, where we use large datasets in combination with a non-reactive simulator to enable large-scale real-world benchmarking. Specifically, we gather simulation-based metrics, such as progress and time to collision, by unrolling bird's eye view abstractions of the test scenes for a short simulation horizon. Our simulation is non-reactive, i.e., the evaluated policy and environment do not influence each other. As we demonstrate empirically, this decoupling allows open-loop metric computation while being better aligned with closed-loop evaluations than traditional displacement errors. NAVSIM enabled a new competition held at CVPR 2024, where 143 teams submitted 463 entries, resulting in several new insights. On a large set of challenging scenarios, we observe that simple methods with moderate compute requirements such as TransFuser can match recent large-scale end-to-end driving architectures such as UniAD. Our modular framework can potentially be extended with new datasets, data curation strategies, and metrics, and will be continually maintained to host future challenges. Our code is available at \url{https://github.com/autonomousvision/navsim}.},
-  keywords = {sub},
+  keywords = {press},
   owner = {amine},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2406.15349}
@@ -1425,10 +1425,10 @@ @inproceedings{DaunerHallgartenEtAl2024
 @inproceedings{WangKimEtAl2024,
   author = {Wang, L. and Kim, S. W. and Yang, J. and Yu, C. and Ivanovic, B. and Waslander, S. L. and Wang, Y. and Fidler, S. and Pavone, M. and Karkus, P.},
   title = {DistillNeRF: Perceiving 3D Scenes from Single-Glance Images by Distilling Neural Fields and Foundation Model Features},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {We propose DistillNeRF, a self-supervised learning framework addressing the challenge of understanding 3D environments from limited 2D observations in autonomous driving. Our method is a generalizable feedforward model that predicts a rich neural scene representation from sparse, single-frame multi-view camera inputs, and is trained self-supervised with differentiable rendering to reconstruct RGB, depth, or feature images. Our first insight is to exploit per-scene optimized Neural Radiance Fields (NeRFs) by generating dense depth and virtual camera targets for training, thereby helping our model to learn 3D geometry from sparse non-overlapping image inputs. Second, to learn a semantically rich 3D representation, we propose distilling features from pre-trained 2D foundation models, such as CLIP or DINOv2, thereby enabling various downstream tasks without the need for costly 3D human annotations. To leverage these two insights, we introduce a novel model architecture with a two-stage lift-splat-shoot encoder and a parameterized sparse hierarchical voxel representation. Experimental results on the NuScenes dataset demonstrate that DistillNeRF significantly outperforms existing comparable self-supervised methods for scene reconstruction, novel view synthesis, and depth estimation; and it allows for competitive zero-shot 3D semantic occupancy prediction, as well as open-world scene understanding through distilled foundation model features. Demos and code will be available at https://distillnerf.github.io/.},
-  keywords = {sub},
+  keywords = {press},
   owner = {amine},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2406.12095}
@@ -1437,7 +1437,7 @@ @inproceedings{WangKimEtAl2024
 @inproceedings{FanWangEtAl2024,
   author = {Fan, Z. and Wang, P. and Zhao, Y. and Zhao, Y. and Ivanovic, B. and Wang, Z. and Pavone, M. and Yang, H. F.},
   title = {Learning Traffic Crashes as Language: Datasets, Benchmarks, and What-if Causal Analyses},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {The increasing rate of road accidents worldwide results not only in significant loss of life but also imposes billions financial burdens on societies. Current research in traffic crash frequency modeling and analysis has predominantly approached the problem as classification tasks, focusing mainly on learning-based classification or ensemble learning methods. These approaches often overlook the intricate relationships among the complex infrastructure, environmental, human and contextual factors related to traffic crashes and risky situations. In contrast, we initially propose a large-scale traffic crash language dataset, named CrashEvent, summarizing 19,340 real-world crash reports and incorporating infrastructure data, environmental and traffic textual and visual information in Washington State. Leveraging this rich dataset, we further formulate the crash event feature learning as a novel text reasoning problem and further fine-tune various large language models (LLMs) to predict detailed accident outcomes, such as crash types, severity and number of injuries, based on contextual and environmental factors. The proposed model, CrashLLM, distinguishes itself from existing solutions by leveraging the inherent text reasoning capabilities of LLMs to parse and learn from complex, unstructured data, thereby enabling a more nuanced analysis of contributing factors. Our experiments results shows that our LLM-based approach not only predicts the severity of accidents but also classifies different types of accidents and predicts injury outcomes, all with averaged F1 score boosted from 34.9% to 53.8%. Furthermore, CrashLLM can provide valuable insights for numerous open-world what-if situational-awareness traffic safety analyses with learned reasoning features, which existing models cannot offer. We make our benchmark, datasets, and model public available for further exploration.},
   keywords = {sub},
@@ -1449,10 +1449,10 @@ @inproceedings{FanWangEtAl2024
 @inproceedings{LiWangEtAl2024,
   author = {Li, Y. and Wang, Z. and Wang, Y. and Yu, Z. and Gojcic, Z. and Pavone, M. and Feng, C. and Alvarez, J. M.},
   title = {Memorize What Matters: Emergent Scene Decomposition from Multitraverse},
-  booktitle = {},
+  booktitle = proc_NIPS,
   year = {2024},
   abstract = {Humans naturally retain memories of permanent elements, while ephemeral moments often slip through the cracks of memory. This selective retention is crucial for robotic perception, localization, and mapping. To endow robots with this capability, we introduce 3D Gaussian Mapping (3DGM), a self-supervised, camera-only offline mapping framework grounded in 3D Gaussian Splatting. 3DGM converts multitraverse RGB videos from the same region into a Gaussian-based environmental map while concurrently performing 2D ephemeral object segmentation. Our key observation is that the environment remains consistent across traversals, while objects frequently change. This allows us to exploit self-supervision from repeated traversals to achieve environment-object decomposition. More specifically, 3DGM formulates multitraverse environmental mapping as a robust differentiable rendering problem, treating pixels of the environment and objects as inliers and outliers, respectively. Using robust feature distillation, feature residuals mining, and robust optimization, 3DGM jointly performs 3D mapping and 2D segmentation without human intervention. We build the Mapverse benchmark, sourced from the Ithaca365 and nuPlan datasets, to evaluate our method in unsupervised 2D segmentation, 3D reconstruction, and neural rendering. Extensive results verify the effectiveness and potential of our method for self-driving and robotics.},
-  keywords = {sub},
+  keywords = {press},
   owner = {amine},
   timestamp = {2024-09-19},
   url = {https://arxiv.org/abs/2405.17187}